diff --git a/.gitignore b/.gitignore index 59ca0d434e57..94cc9baccad9 100644 --- a/.gitignore +++ b/.gitignore @@ -161,4 +161,7 @@ tests/mxnet_unit_tests coverage.xml # Local CMake build config -cmake_options.yml \ No newline at end of file +cmake_options.yml + +# header file generated at compile time +include/mkldnn/mkldnn_version.h diff --git a/3rdparty/dmlc-core b/3rdparty/dmlc-core index 3ffea8694adf..3943914eed66 160000 --- a/3rdparty/dmlc-core +++ b/3rdparty/dmlc-core @@ -1 +1 @@ -Subproject commit 3ffea8694adf9c0363f9abbf162dc0e4a45b22c5 +Subproject commit 3943914eed66470bd010df581e29e4dca4f7df6f diff --git a/3rdparty/mkldnn b/3rdparty/mkldnn index 7de7e5d02bf6..41bee20d7eb4 160000 --- a/3rdparty/mkldnn +++ b/3rdparty/mkldnn @@ -1 +1 @@ -Subproject commit 7de7e5d02bf687f971e7668963649728356e0c20 +Subproject commit 41bee20d7eb4a67feeeeb8d597b3598994eb1959 diff --git a/3rdparty/mshadow b/3rdparty/mshadow index 6e94643bdf1d..1d79ecfdb4c9 160000 --- a/3rdparty/mshadow +++ b/3rdparty/mshadow @@ -1 +1 @@ -Subproject commit 6e94643bdf1d51a505b147f28c358fb71070b8fd +Subproject commit 1d79ecfdb4c9234537e1bf5148f44a1af54501ec diff --git a/3rdparty/tvm b/3rdparty/tvm index 0f053c82a747..21935dcbf56a 160000 --- a/3rdparty/tvm +++ b/3rdparty/tvm @@ -1 +1 @@ -Subproject commit 0f053c82a747b4dcdf49570ec87c17e0067b7439 +Subproject commit 21935dcbf56ad3bd66ebff9891a6bc3865b8106d diff --git a/CMakeLists.txt b/CMakeLists.txt index 896c7b75a1ec..2142a09d6d2e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -51,6 +51,7 @@ mxnet_option(USE_TENSORRT "Enable infeference optimization with TensorRT mxnet_option(USE_ASAN "Enable Clang/GCC ASAN sanitizers." OFF) mxnet_option(ENABLE_TESTCOVERAGE "Enable compilation with test coverage metric output" OFF) mxnet_option(USE_INT64_TENSOR_SIZE "Use int64_t to represent the total number of elements in a tensor" OFF) +mxnet_option(BUILD_CYTHON_MODULES "Build cython modules." 
OFF) message(STATUS "CMAKE_CROSSCOMPILING ${CMAKE_CROSSCOMPILING}") message(STATUS "CMAKE_HOST_SYSTEM_PROCESSOR ${CMAKE_HOST_SYSTEM_PROCESSOR}") @@ -147,9 +148,11 @@ else(MSVC) endif() if(CMAKE_BUILD_TYPE STREQUAL "Debug") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O0 -g") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_ASSERTIONS") elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo") add_definitions(-DNDEBUG=1) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O3 -g") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_ASSERTIONS") else() add_definitions(-DNDEBUG=1) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O3") @@ -159,7 +162,7 @@ else(MSVC) elseif(SUPPORT_MSSE2) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -msse2") endif() - set(CMAKE_CXX_FLAGS ${CMAKE_C_FLAGS}) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CMAKE_C_FLAGS}") if(SUPPORT_CXX14) add_definitions(-DDMLC_USE_CXX11=1) add_definitions(-DDMLC_USE_CXX14=1) @@ -834,3 +837,12 @@ endif() set(LINT_DIRS "include src plugin cpp-package tests") set(EXCLUDE_PATH "src/operator/contrib/ctc_include") add_custom_target(mxnet_lint COMMAND ${CMAKE_COMMAND} -DMSVC=${MSVC} -DPYTHON_EXECUTABLE=${PYTHON_EXECUTABLE} -DLINT_DIRS=${LINT_DIRS} -DPROJECT_SOURCE_DIR=${CMAKE_CURRENT_SOURCE_DIR} -DPROJECT_NAME=mxnet -DEXCLUDE_PATH=${EXCLUDE_PATH} -P ${CMAKE_CURRENT_SOURCE_DIR}/3rdparty/dmlc-core/cmake/lint.cmake) + +if(BUILD_CYTHON_MODULES) + include(cmake/BuildCythonModules.cmake) + add_cython_modules(2) # Build cython module for python2 if python2 is found + add_cython_modules(3) # Build cython module for python3 if python3 is found + if((NOT ${PYTHON2_FOUND}) AND (NOT ${PYTHON3_FOUND})) + message(FATAL_ERROR "No python interpreter found to build cython modules") + endif() +endif() diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index f0ec80e2725c..c76f8c6edbc8 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -69,6 +69,8 @@ The committers are the granted write access to the project. - Patric is a parallel computing expert and a major contributor to the MXNet MKL-DNN backend. * [Tao Lv](https://github.com/TaoLv) - Tao is a major contributor to the MXNet MKL-DNN backend and performance on CPU. +* [Zach Kimberg](https://github.com/zachgk) + - Zach is one of the major maintainers of the MXNet Scala package. 
### Become a Committer @@ -237,9 +239,10 @@ List of Contributors * [Zhennan Qin](https://github.com/ZhennanQin) * [Zhiyuan Huang](https://github.com/huangzhiyuan) * [Zak Jost](https://github.com/zjost) +* [Nick Guletskii](https://github.com/nickguletskii) * [Shoubhik Bhattacharya](https://github.com/shoubhik) -* [Zach Kimberg](https://github.com/zachgk) * [Rohit Srivastava](https://github.com/access2rohit) +* [Caner Turkmen](https://github.com/canerturkmen) Label Bot --------- diff --git a/KEYS b/KEYS index 7b78ea97e3ea..ff503f7efb7e 100644 --- a/KEYS +++ b/KEYS @@ -688,4 +688,63 @@ n4aiPNGpG7CDmCNnGMJgNYEEbqe1RQ7B4xwmNmGJvdVJRsTfy5557hZNfIfVkdES QTXMfTPP627GwzHQXTdAn9CSGW5FkaSHTVTCZhalBHhAFMDg86ZGUxZDYwhf3s6W 44liPzisQFRxRFOwEubvmw== =dQvb + +-----END PGP PUBLIC KEY BLOCK----- +pub rsa4096 2019-05-15 [SC] + 228ADD932CB218723E61D09E043071126325F0EC +uid [ultimate] Zach Kimberg +sig 3 043071126325F0EC 2019-05-15 Zach Kimberg +sub rsa4096 2019-05-15 [E] +sig 043071126325F0EC 2019-05-15 Zach Kimberg + +-----BEGIN PGP PUBLIC KEY BLOCK----- + +mQINBFzcgWsBEADpXccvPwwE1tNd/f5L1+x5+Kcaw1jLhb1y3/S9Um8iE3wZsRvR +pxNXhw8n+obqVicWFrShCI2rS6yUZFeWAP8X3XWM47sx93y/fFpg4+mDP0Ejl0op +VmZeiX4MCwloMWRMpN5XtiLzilNVUuisa0UhHJaQ67eOjQuZac/nbJojptBaBa9D +zf/1TLAd7mTTk8TBosouSd13gCX262EJb2n2hOYl2hx59Tky1CYNoHpQYdfH+u0U +bwOfMEzbrrD8HyqF1eeEu8EagWKc5piByOWn6smBjpU2uBqBh8N6MH/mY5aDsqGB +wkexiAsq/sKbPi0iFJ0CWmSls69Twe0vmW+THh7SWfGhbUxGwHsuYYIQnjcTHSMg +HZHhB0RrjqiYtyfSvqo0mSOgwtZAX0dg4uCyZtPPeRo9X5qIl8DVPVtKqybdVsX1 +06Pt8EaSSFlxPJN/giw86GflsP2hL3ttjB/p3/8oa1ffgl+Z8xjrkwlDBnKL2BI5 +sfad/l//oPd41IheNji6C2TdnZYyWRpDumes1Jr924E25bAcy3lI82QDdpyHSp4v +9+LG3NRpqLzQ/LpgBZnpjSnVMN1xBdwXpJ87omKM+fzgG0qiScBKko8jGYeRr/IR +sX1ofHUIty57zzWEUc5MN/zgtnIxY3ZSHs1erfnZm5JDou+1YblxkiMGZQARAQAB +tCBaYWNoIEtpbWJlcmcgPHphY2hna0BhcGFjaGUub3JnPokCTgQTAQgAOBYhBCKK +3ZMsshhyPmHQngQwcRJjJfDsBQJc3IFrAhsDBQsJCAcCBhUKCQgLAgQWAgMBAh4B +AheAAAoJEAQwcRJjJfDs1hgQAMMQkFmjWIOvHC63qJPeMP/BwgSeyANczohTWhBk +fkpoxFHW5nGxOJePyHqINufy0G0eQLWzBQp/VFTTFadS/tPL0gZi2JvOXSLmroLa +LBGcCkChTjXj1ah7pDq9J3KlHeOY6fzDDA9+3+8XRXMaROL2Bi3ax3jk0hhQ9iMh +jH8iROHsAZAJg7CQ9MUCnlJlyS2mCNVOWBlsKWjoOW4s3pptvH8VBXnYRQfcTZbN +VDwbwxvpq8cDgK+YJ+53MsgxCiYXQ0xtYOwYeThOdf29hRdOJBLwPpbZIlwCgYF5 +xeZinRCh10eVc8UvSFurXT4i91XulWs1pSMXv6xEV5LkXINzpfU8zzfCdpMYpq7n +j7SJc+x5EhjUt8LQsa6ohVHsmEMQpEuX18uOxaA+WE2XAn4Y5kty9Hm1g6osSFEl +40eGio9rz3zd63cHU3c9ccKAg0oxtWnsAw/kvSfExCg40kCvtrmrSoZt+iQnnw1l +isvYtrzEjQiufbF4wryIfFcP/BZEZ4KeDvC6cU6tKyH8gYlW89EYfW92E4rfvofs +o7i0Lo4vVVBmVqCoCORcsJsfRvZs5BpG4SXGD9kOsrz+LVmm3Y7vf68ycc7cTbKs +rlIGLbMzIfk43s9i+C9IRgCo7PjImi7cbEBi52FvriR37ispDhvm2WX4i9o1DNl0 +FPeiuQINBFzcgWsBEADeqh6lsnBd0dzCNANIAhR8EsbRxuy3ihg52RVZR3HcUYyl +osMlXWhGxz2HD/Tt6hKMv32oUXik14gIY9hsyXEBZpDsQAwW3fbyM2JJhTmup/ag +F7Cy76NL1GvgPQ3soClGoXNn6m+W6wDW0shR0yDCfPpWY/h56Ub+7CTcfk4STbgV +Erib+fr7sFlzGobpCOWq80k7wb70ak1J57sEBg1wJFVyme9OJGUfkUAznX8TPFy4 +o4esggkc9Wnr4hzMpk5n6/J0YVl//2YvVFlY8fvnw2pmHnyJpoMqf6ZomT2YhSJ6 +Z0Ni+tyr/CCTNpF7lKvRn9TEbP9Ll2YAWXFGK+MAz6fGmijm7V39IQN/F6duXq+5 +xrsN+tB8udxSRYgp3jG3jSdWvryC8XYyYcXKhgPhq5+PNzRCbkh2/rqqrgAjOOFS +9kIMi95r1Rtb72CUevShHfC1WOulp9fzjt/zA5IpRlI5944CbBVa5wpAS4WyEBTb +FCD9SCK2We2Nu7lJAfjdgKYQeVu3USOQaCRib/eNv2o28veqERV1ZlquefO1qtDa +rstGbiFMI+CRMN5E6Y57gAxaOGud+H3o+DhdrzSTTHGLXsge/upjnRaS6PN3eEaK +gLtUKYMuXjNBoSMSNylSkGxvlJSUQWAPEbn6fHUaZSSIugrl9Z9/TcHQTxCd5wAR +AQABiQI2BBgBCAAgFiEEIordkyyyGHI+YdCeBDBxEmMl8OwFAlzcgWsCGwwACgkQ +BDBxEmMl8Oyc1g//Z7wnTImcKyFaNNxGMHic5NPpf3e+zIqsLnDqKnUkiWxXMstE 
+3pY0aX8b44fXy/QrUm18jC5LdDd/qH6sXdbBb9hBPwXmp6/WT5vSCn+Bnrj7iPE5
+DWr5mM1cisosn20UGQnb9wVGNrVD3GUwylQ58mu6ehbPTQ3Jah1DtVqpx8YfN9fR
+W5PPomKd8zRnYQ7i7nwkj32hWmBW0Kd+lgtCUunT2diic3w1PkQU4IL4Cr9wL4lO
+iLN5YVD4D2JeUC7t4mB5EJ3UT1/IcFYIwF0ULYhD1Ke00JocQ6pEVaGkg4Ll4wLN
+uO7kSOWbhvHwpH2uPtsAfTJ0h3MFsOSLypN+BdEiLNQ54c4U1zQ9BHzk6xJ3U37U
+eSQr5nNq3ceqjtH//7PR5/+OpeTbYuS/75LcujyKP73SqoZLS+41MNzmLnG5nBhJ
+dROfxO+mRLuY7fgZWlDBLAfe8Rmwfd7pxWusggBQ1MQjvweYRbBXwVxog04hL4uY
+Z0/2Tt5t15CGyVCY7HpvnpPTmHKHcSKlRiFHOp+kNLWbxFC0ryMBntYjinAvk1xy
+ihUvH40rKlgxbV3+KS/Ew88D2tJ2JGCACx2yzS3trw+oUXugoaAQiileXQfu47SI
+dM5GPiAt2UECZ9v4WthkRGygnoPiL/4IyDFsS9yDX7mOBSycVmT5R+VXmOs=
+=z8mO
-----END PGP PUBLIC KEY BLOCK-----
diff --git a/Makefile b/Makefile
index b578683a74b6..741c5f0190f2 100644
--- a/Makefile
+++ b/Makefile
@@ -94,7 +94,7 @@ endif
 # CFLAGS for debug
 ifeq ($(DEBUG), 1)
-	CFLAGS += -g -O0
+	CFLAGS += -g -O0 -D_GLIBCXX_ASSERTIONS
 else
 	CFLAGS += -O3 -DNDEBUG=1
 endif
@@ -594,7 +594,7 @@ lint: cpplint rcpplint jnilint pylint
 cpplint:
 	3rdparty/dmlc-core/scripts/lint.py mxnet cpp include src plugin cpp-package tests \
-	--exclude_path src/operator/contrib/ctc_include
+	--exclude_path src/operator/contrib/ctc_include include/mkldnn
 
 pylint:
 	python3 -m pylint --rcfile=$(ROOTDIR)/ci/other/pylintrc --ignore-patterns=".*\.so$$,.*\.dll$$,.*\.dylib$$" python/mxnet tools/caffe_converter/*.py
@@ -612,7 +612,7 @@ doxygen:
 
 # Cython build
 cython:
-	cd python; python setup.py build_ext --inplace --with-cython
+	cd python; $(PYTHON) setup.py build_ext --inplace --with-cython
 
 cython2:
 	cd python; python2 setup.py build_ext --inplace --with-cython
@@ -677,6 +677,26 @@ rclean:
 	$(RM) -r R-package/src/image_recordio.h R-package/NAMESPACE R-package/man R-package/R/mxnet_generated.R \
 		R-package/inst R-package/src/*.o R-package/src/*.so mxnet_*.tar.gz
 
+build/rat/apache-rat/target/apache-rat-0.13-SNAPSHOT.jar:
+	mkdir -p build
+	svn co http://svn.apache.org/repos/asf/creadur/rat/branches/0.12-release/ build/rat; \
+	cd build/rat; \
+	mvn -Dmaven.test.skip=true install;
+
+ratcheck: build/rat/apache-rat/target/apache-rat-0.13-SNAPSHOT.jar
+	exec 5>&1; \
+	RAT_JAR=build/rat/apache-rat/target/apache-rat-0.13-SNAPSHOT.jar; \
+	OUTPUT=$$(java -jar $$RAT_JAR -E tests/nightly/apache_rat_license_check/rat-excludes -d .|tee >(cat - >&5)); \
+	ERROR_MESSAGE="Printing headers for text files without a valid license header"; \
+	echo "-------Process The Output-------"; \
+	if [[ $$OUTPUT =~ $$ERROR_MESSAGE ]]; then \
+		echo "ERROR: RAT Check detected files with unknown licenses. Please fix and run test again!"; \
+		exit 1; \
+	else \
+		echo "SUCCESS: There are no files with an Unknown License."; \
+	fi
+
+
 ifneq ($(EXTRA_OPERATORS),)
 clean: rclean cyclean $(EXTRA_PACKAGES_CLEAN)
 	$(RM) -r build lib bin deps *~ */*~ */*/*~ */*/*/*~
diff --git a/NEWS.md b/NEWS.md
index ad842ac84786..59f8de831c50 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -18,6 +18,24 @@
 MXNet Change Log
 ================
+## 1.4.1
+
+Apache MXNet (incubating) 1.4.1 is a maintenance release incorporating important bug fixes and performance improvements. All users of Apache MXNet (incubating) 1.4.0 are advised to upgrade. You can install Apache MXNet (incubating) 1.4.1 at the usual place. Please review these Release Notes to learn about the bug fixes.
+ +### Bug-fixes +* Java bug-fix cherry pick (#14834) +* Use DEFAULT macro in C APIs (#14767) (#14789) +* Set idx2name for Optimizer object (#14703) (#14772) +* Add pin_device_id option to Gluon DataLoader (#14136) (#14771) +* Tidy up storage allocation and deallocation (#14480) (#14768) +* Add MXEnginePushAsync and MXEnginePushSync C APIs (#14615) (#14770) +* Less cudaGet/SetDevice calls in Gluon execution (#13764) +* Fix nightly build of 1.4.x (#14556) +* Memory fixes. Resolves #10867, and resolves #14080 (#14372) (#14586) +* Fixes for data links (#14526) +* Backport of Windows CI Fixes (#14420) + + ## 1.4.0 - [New Features](#new-features) diff --git a/README.md b/README.md index 3eea2e78fa54..f3e524c79540 100644 --- a/README.md +++ b/README.md @@ -50,6 +50,7 @@ How to Contribute What's New ---------- +* [Version 1.4.1 Release](https://github.com/apache/incubator-mxnet/releases/tag/1.4.1) - MXNet 1.4.1 Patch Release. * [Version 1.4.0 Release](https://github.com/apache/incubator-mxnet/releases/tag/1.4.0) - MXNet 1.4.0 Release. * [Version 1.3.1 Release](https://github.com/apache/incubator-mxnet/releases/tag/1.3.1) - MXNet 1.3.1 Patch Release. * [Version 1.3.0 Release](https://github.com/apache/incubator-mxnet/releases/tag/1.3.0) - MXNet 1.3.0 Release. diff --git a/amalgamation/README.md b/amalgamation/README.md index 2ecf1626c1e7..b58776e372aa 100644 --- a/amalgamation/README.md +++ b/amalgamation/README.md @@ -114,17 +114,17 @@ To Change ``` #ifdef __GNUC__ - #define MX_TREAD_LOCAL __thread + #define MX_THREAD_LOCAL __thread #elif __STDC_VERSION__ >= 201112L - #define MX_TREAD_LOCAL _Thread_local + #define MX_THREAD_LOCAL _Thread_local #elif defined(_MSC_VER) - #define MX_TREAD_LOCAL __declspec(thread) + #define MX_THREAD_LOCAL __declspec(thread) #endif ``` To ``` -#define MX_TREAD_LOCAL __declspec(thread) +#define MX_THREAD_LOCAL __declspec(thread) ``` **To build arm32 compatible version (e.g. iPhone 5):** diff --git a/benchmark/__init__.py b/benchmark/__init__.py new file mode 100644 index 000000000000..13a83393a912 --- /dev/null +++ b/benchmark/__init__.py @@ -0,0 +1,16 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. diff --git a/benchmark/opperf/README.md b/benchmark/opperf/README.md new file mode 100644 index 000000000000..99c75be2bf7b --- /dev/null +++ b/benchmark/opperf/README.md @@ -0,0 +1,182 @@ + + + + + + + + + + + + + + + + + +# MXNet Operator Performance Benchmarks + +A Python utility for benchmarking and profiling individual MXNet operator execution. + +With this utility, for each MXNet operator you can get the following details: + +**Timing** +1. Forward execution time +2. Backward execution time +3. Time spent for memory management + +**Memory** +1. 
Total memory allocated
+
+# Motivation
+
+Benchmarks are usually done end-to-end for a given network architecture, for example ResNet-50 benchmarks on ImageNet data. This is a good measure of the overall performance and health of a deep learning framework. However, it is important to note the following factors:
+1. Users use many operators that are not part of a standard network like ResNet, such as tensor manipulation operators like mean, max, topk, argmax, sort etc.
+2. A standard network architecture like ResNet-50 is made up of many operators, e.g. Convolution2D, Softmax, Dense and more. Consider the following scenarios:
+    1. We improve the performance of the Convolution2D operator, but due to a bug, Softmax performance goes down. End-to-end benchmarks may still look fine, so we can miss the performance degradation of a single operator, which can accumulate and become untraceable.
+    2. You need to see which operator in a given network takes the most time so you can plan optimization work. With end-to-end benchmarks, it is hard to get such fine-grained numbers at the operator level.
+3. We need to know how different operators perform on different hardware infrastructure (e.g. CPU with MKL-DNN, GPU with NVIDIA CUDA and cuDNN). With these details, we can plan optimization work at the operator level, which can significantly boost end-to-end performance.
+4. You may want nightly performance tests across all operators of a deep learning framework to catch regressions early.
+5. We can integrate this framework with a CI/CD system to run per-operator performance tests for PRs. For example, when a PR modifies the kernel of TransposeConv2D, we can run benchmarks of the TransposeConv2D operator to verify performance.
+
+Hence, this utility provides the functionality for users and developers of deep learning frameworks to easily run benchmarks for individual operators.
+
+# How to use
+
+## Prerequisites
+
+This utility uses the MXNet profiler under the hood to fetch compute and memory metrics. Hence, you need to build MXNet with the `USE_PROFILER=1` flag.
+
+Make sure to build the flavor of MXNet on which you would like to measure operator performance, for example with/without MKL, or with CUDA 9 or 10.1. Finally, you need to add the path of your cloned MXNet repository to the PYTHONPATH:
+
+```
+export PYTHONPATH=$PYTHONPATH:/path/to/incubator-mxnet/
+```
+
+## Usecase 1 - Run benchmarks for all the operators
+
+The command below runs all the MXNet operator (NDArray) benchmarks with default inputs and saves the final result as JSON in the given file:
+
+```
+python incubator-mxnet/benchmark/opperf/opperf.py --output-format json --output-file mxnet_operator_benchmark_results.json
+```
+
+**Other Supported Options:**
+
+1. **output-format** : `json` or `md` for markdown file output.
+
+2. **ctx** : `cpu` or `gpu`. By default, cpu on a CPU machine, gpu(0) on a GPU machine. You can override and set the global context for all operator benchmarks. Example: --ctx gpu(2).
+
+3. **dtype** : By default, `float32`. You can override and set the global dtype for all operator benchmarks. Example: --dtype float64.
+
+## Usecase 2 - Run benchmarks for all the operators in a specific category
+
+For example, to run benchmarks for all NDArray broadcast binary operators (broadcast_add, broadcast_mod, broadcast_pow etc.), you just run the following python script:
+
+```
+#!/usr/bin/python
+from benchmark.opperf.nd_operations.binary_operators import run_mx_binary_broadcast_operators_benchmarks
+
+# Run all Binary Broadcast operations benchmarks with default input values
+print(run_mx_binary_broadcast_operators_benchmarks())
+```
+
+Output for the above benchmark run, on a CPU machine, would look something like this:
+
+```
+{'broadcast_mod': [{'avg_time_forward_broadcast_mod': 28.7063, 'avg_time_mem_alloc_cpu/0': 4194.3042,
+                    'avg_time_backward_broadcast_mod': 12.0954, 'inputs': {'lhs': (1024, 1024), 'rhs': (1024, 1024)}},
+                   {'avg_time_forward_broadcast_mod': 2.7332, 'avg_time_mem_alloc_cpu/0': 400.0,
+                    'avg_time_backward_broadcast_mod': 1.1288, 'inputs': {'lhs': (10000, 10), 'rhs': (10000, 10)}},
+                   {'avg_time_forward_broadcast_mod': 30.5322, 'avg_time_mem_alloc_cpu/0': 4000.0,
+                    'avg_time_backward_broadcast_mod': 225.0255, 'inputs': {'lhs': (10000, 1), 'rhs': (10000, 100)}}],
+ 'broadcast_power': [{'avg_time_backward_broadcast_power': 49.5871, 'avg_time_forward_broadcast_power': 18.0954,
+                      'avg_time_mem_alloc_cpu/0': 4194.3042, 'inputs': {'lhs': (1024, 1024), 'rhs': (1024, 1024)}},
+                     {'avg_time_backward_broadcast_power': 4.6623, 'avg_time_forward_broadcast_power': 1.8283,
+                      'avg_time_mem_alloc_cpu/0': 400.0, 'inputs': {'lhs': (10000, 10), 'rhs': (10000, 10)}},
+                     {'avg_time_backward_broadcast_power': 279.922, 'avg_time_forward_broadcast_power': 24.4621,
+                      'avg_time_mem_alloc_cpu/0': 4000.0, 'inputs': {'lhs': (10000, 1), 'rhs': (10000, 100)}}],
+.....
+.....
+```
+
+## Usecase 3 - Run benchmarks for a specific operator
+For example, to run benchmarks for the `nd.add` operator in MXNet, you just run the following python script:
+
+```
+#!/usr/bin/python
+import mxnet as mx
+from mxnet import nd
+
+from benchmark.opperf.utils.benchmark_utils import run_performance_test
+
+add_res = run_performance_test(nd.add, run_backward=True, dtype='float32', ctx=mx.cpu(),
+                               inputs=[{"lhs": (1024, 1024),
+                                        "rhs": (1024, 1024)}],
+                               warmup=10, runs=25)
+```
+
+Output for the above benchmark run, on a CPU machine, would look something like this:
+
+```
+{'add': [{'avg_time_mem_alloc_cpu/0': 102760.4453,
+          'avg_time_forward_broadcast_add': 4.0372,
+          'avg_time_backward_broadcast_add': 5.3841,
+          'inputs': {'lhs': (1024, 1024), 'rhs': (1024, 1024)}}]}
+```
+
+## Usecase 3.1 - Run benchmarks for a group of operators with the same input
+For example, to run benchmarks for the `nd.add` and `nd.sub` operators in MXNet with the same set of inputs, you just run the following python script:
+
+```
+#!/usr/bin/python
+import mxnet as mx
+from mxnet import nd
+
+from benchmark.opperf.utils.benchmark_utils import run_performance_test
+
+add_res = run_performance_test([nd.add, nd.sub], run_backward=True, dtype='float32', ctx=mx.cpu(),
+                               inputs=[{"lhs": (1024, 1024),
+                                        "rhs": (1024, 1024)}],
+                               warmup=10, runs=25)
+```
+
+Output for the above benchmark run, on a CPU machine, would look something like this:
+
+```
+{'add': [{'avg_time_mem_alloc_cpu/0': 102760.4453,
+          'avg_time_forward_broadcast_add': 4.0372,
+          'avg_time_backward_broadcast_add': 5.3841,
+          'inputs': {'lhs': (1024, 1024), 'rhs': (1024, 1024)}}],
+ 'subtract': [{'avg_time_forward_broadcast_sub': 5.5137,
+               'avg_time_mem_alloc_cpu/0': 207618.0469,
+               'avg_time_backward_broadcast_sub': 7.2976,
+               'inputs': {'lhs': (1024, 1024), 'rhs': (1024, 1024)}}
+              ]}
+```
+
+# How does it work under the hood?
+
+Under the hood, the utility executes each NDArray operator using randomly generated data, and uses the MXNet profiler to get a summary of the operator execution:
+1. Memory
+2. Computation time (forward, backward)
+
+See the design proposal document for more details - https://cwiki.apache.org/confluence/display/MXNET/MXNet+Operator+Benchmarks
+
+**NOTE:**
+
+This utility queries the MXNet operator registry to fetch all operators registered with MXNet, generates inputs and runs benchmarks.
+However, fully automated tests are enabled only for simpler operators such as broadcast operators, element_wise operators etc. For readability and to give more control to users, complex operators such as convolution (2D, 3D), Pooling and Recurrent are not fully automated but are expressed as default rules.
+See `utils/op_registry_utils.py` for more details.
+
+# TODO
+
+All contributions are welcome. Below is the list of desired features:
+
+1. Cover all MXNet operators.
+2. Enhance the MXNet profiler with additional APIs to programmatically fetch and process profiler data.
+3. Integration with a CI/CD system to run operator benchmarks for PR builds and nightly builds.
+4. Dashboards and other modes of presentation of results for analyzing and planning tasks such as operator performance improvements.
+5. Randomized tensor shape generation for profiling to identify bottlenecks in the operators.
diff --git a/benchmark/opperf/__init__.py b/benchmark/opperf/__init__.py
new file mode 100644
index 000000000000..13a83393a912
--- /dev/null
+++ b/benchmark/opperf/__init__.py
@@ -0,0 +1,16 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
diff --git a/benchmark/opperf/custom_operations/__init__.py b/benchmark/opperf/custom_operations/__init__.py
new file mode 100644
index 000000000000..13a83393a912
--- /dev/null
+++ b/benchmark/opperf/custom_operations/__init__.py
@@ -0,0 +1,16 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
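For reference, the profiler-based measurement flow described in the opperf README above can be sketched in a few lines. This is a minimal illustration assuming the MXNet 1.x `mx.profiler` API; the operator, tensor shapes, and output file name here are illustrative and are not part of this diff's utilities:

```python
import mxnet as mx
from mxnet import nd, profiler

# Configure the profiler to record all events and keep aggregate statistics.
profiler.set_config(profile_all=True, aggregate_stats=True, filename='op_profile.json')

lhs = nd.random.uniform(shape=(1024, 1024))
rhs = nd.random.uniform(shape=(1024, 1024))

profiler.set_state('run')
out = nd.broadcast_add(lhs, rhs)
out.wait_to_read()            # MXNet is asynchronous; synchronize so the op is actually timed
profiler.set_state('stop')

# Aggregated per-operator time and memory statistics, similar to what
# opperf post-processes into its benchmark results.
print(profiler.dumps())
```

Because MXNet executes operators asynchronously, the explicit `wait_to_read()` call is what guarantees the operator has finished before profiling stops.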
diff --git a/benchmark/opperf/custom_operations/custom_operations.py b/benchmark/opperf/custom_operations/custom_operations.py new file mode 100644 index 000000000000..f26aed9b5b28 --- /dev/null +++ b/benchmark/opperf/custom_operations/custom_operations.py @@ -0,0 +1,67 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import mxnet as mx + +""" +MXNet's Custom Operator Benchmark Tests. + +It does a simple element wise addition to make sure computation +is not too much and we can observe custom operator logistics overhead. +""" + + +# 1. Define Custom Operator - Element wise Addition Multiplication +class CustomAddOne(mx.operator.CustomOp): + def forward(self, is_train, req, in_data, out_data, aux): + self.assign(out_data[0], req[0], in_data[0] + 1) + + def backward(self, req, out_grad, in_data, out_data, in_grad, aux): + self.assign(in_grad[0], req[0], out_grad[0]) + + +@mx.operator.register("CustomAddOne") +class CustomAddOneProp(mx.operator.CustomOpProp): + def __init__(self): + super(CustomAddOneProp, self).__init__(need_top_grad=True) + + def list_arguments(self): + return ['in'] + + def list_outputs(self): + return ['output'] + + def infer_shape(self, in_shape): + # inputs, outputs, aux + return [in_shape[0]], [in_shape[0]], [] + + def create_operator(self, ctx, shapes, dtypes): + return CustomAddOne() + + +"""Helps to benchmark MXNet's Custom Op for Element wise addition on a (1000, 1) tensor. + Performs both forward and backward operation. + + This test mainly uncovers core custom op overhead in MXNet. + + Benchmark will be done on the following operation: + native_add -> native_add -> native_add -> CUSTOM_ADD -> native_add -> native_add -> native_add + + By default run on 'float32' precision. +""" + +# TODO diff --git a/benchmark/opperf/nd_operations/README.md b/benchmark/opperf/nd_operations/README.md new file mode 100644 index 000000000000..7aa220c4368a --- /dev/null +++ b/benchmark/opperf/nd_operations/README.md @@ -0,0 +1,143 @@ + + + + + + + + + + + + + + + + + +# TODO - Operators not covered in this Benchmark Utility + +**NOTE:** This list is AUTOGENERATED when you run opperf.py utility + +0. LogisticRegressionOutput +1. broadcast_axes +2. ravel_multi_index +3. multi_sgd_mom_update +4. smooth_l1 +5. scatter_nd +6. reshape +7. one_hot +8. linalg_potri +9. mp_sgd_update +10. multi_sgd_update +11. signum_update +12. Convolution_v1 +13. repeat +14. Custom +15. softmax_cross_entropy +16. SwapAxis +17. norm +18. Softmax +19. rmspropalex_update +20. fill_element_0index +21. cast +22. UpSampling +23. BatchNorm_v1 +24. CTCLoss +25. LRN +26. cast_storage +27. pick +28. GridGenerator +29. sample_multinomial +30. Activation +31. LinearRegressionOutput +32. Pooling_v1 +33. ftml_update +34. Crop +35. ElementWiseSum +36. diag +37. Reshape +38. Pad +39. 
linalg_gemm2 +40. crop +41. rmsprop_update +43. RNN +44. argmin +45. SoftmaxOutput +46. linalg_extractdiag +47. sgd_mom_update +48. SequenceLast +49. Deconvolution +50. flip +51. SequenceReverse +52. swapaxes +53. SVMOutput +54. linalg_trsm +55. where +56. SoftmaxActivation +57. signsgd_update +58. slice +59. linalg_gelqf +60. softmin +61. linalg_gemm +62. BilinearSampler +63. mp_sgd_mom_update +64. choose_element_0index +65. tile +66. space_to_depth +67. gather_nd +68. argsort +69. SequenceMask +70. reshape_like +71. slice_axis +72. stack +73. topk +74. khatri_rao +75. multi_mp_sgd_update +76. linalg_sumlogdiag +77. broadcast_to +78. IdentityAttachKLSparseReg +79. sort +80. SpatialTransformer +81. Concat +82. uniform +83. InstanceNorm +84. expand_dims +85. multi_mp_sgd_mom_update +86. reverse +87. add_n +88. clip +89. ctc_loss +90. shape_array +91. unravel_index +92. linalg_potrf +93. Cast +94. broadcast_like +95. Embedding +96. linalg_makediag +97. transpose +98. linalg_syrk +99. squeeze +101. ROIPooling +102. ftrl_update +103. SliceChannel +104. slice_like +105. depth_to_space +106. linalg_maketrian +108. pad +109. LayerNorm +110. split +111. MAERegressionOutput +112. Correlation +113. argmax +114. batch_take +115. L2Normalization +116. broadcast_axis +117. linalg_trmm +118. linalg_extracttrian +119. normal +120. take +121. MakeLoss +122. sgd_update +123. adam_update +124. concat \ No newline at end of file diff --git a/benchmark/opperf/nd_operations/__init__.py b/benchmark/opperf/nd_operations/__init__.py new file mode 100644 index 000000000000..13a83393a912 --- /dev/null +++ b/benchmark/opperf/nd_operations/__init__.py @@ -0,0 +1,16 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. diff --git a/benchmark/opperf/nd_operations/binary_operators.py b/benchmark/opperf/nd_operations/binary_operators.py new file mode 100644 index 000000000000..7f93621eb2ec --- /dev/null +++ b/benchmark/opperf/nd_operations/binary_operators.py @@ -0,0 +1,92 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +"""Performance benchmark tests for MXNet NDArray Binary Operations - covers both broadcast and element_wise. +1. Operators are automatically fetched from MXNet operator registry. +2. Default Inputs are generated. See rules/default_params.py. You can override the default values. + +Below 20 binary broadcast Operators are covered: + +['broadcast_add', 'broadcast_div', 'broadcast_equal', 'broadcast_greater', 'broadcast_greater_equal', +'broadcast_hypot', 'broadcast_lesser', 'broadcast_lesser_equal', 'broadcast_logical_and', +'broadcast_logical_or', 'broadcast_logical_xor', 'broadcast_maximum', 'broadcast_minimum', +'broadcast_minus', 'broadcast_mod', 'broadcast_mul', 'broadcast_not_equal', 'broadcast_plus', +'broadcast_power', 'broadcast_sub'] + +Below 4 binary element_wise Operators are covered: +['elemwise_add', 'elemwise_mul', 'elemwise_sub', 'elemwise_div'] + +""" +import mxnet as mx + +from benchmark.opperf.utils.benchmark_utils import run_op_benchmarks +from benchmark.opperf.utils.op_registry_utils import get_all_broadcast_binary_operators, \ + get_all_elemen_wise_binary_operators + + +def run_mx_binary_broadcast_operators_benchmarks(ctx=mx.cpu(), dtype='float32', warmup=10, runs=50): + """Runs benchmarks with the given context and precision (dtype)for all the binary + broadcast operators in MXNet. + + Parameters + ---------- + ctx: mx.ctx + Context to run benchmarks + dtype: str, default 'float32' + Precision to use for benchmarks + warmup: int, default 10 + Number of times to run for warmup + runs: int, default 50 + Number of runs to capture benchmark results + + Returns + ------- + Dictionary of results. Key -> Name of the operator, Value -> Benchmark results. + + """ + # Fetch all Binary Broadcast Operators + mx_binary_broadcast_ops = get_all_broadcast_binary_operators() + # Run benchmarks + mx_binary_op_results = run_op_benchmarks(mx_binary_broadcast_ops, dtype, ctx, warmup, runs) + return mx_binary_op_results + + +def run_mx_binary_element_wise_operators_benchmarks(ctx=mx.cpu(), dtype='float32', warmup=10, runs=50): + """Runs benchmarks with the given context and precision (dtype)for all the binary + element_wise operators in MXNet. + + Parameters + ---------- + ctx: mx.ctx + Context to run benchmarks + dtype: str, default 'float32' + Precision to use for benchmarks + warmup: int, default 10 + Number of times to run for warmup + runs: int, default 50 + Number of runs to capture benchmark results + + Returns + ------- + Dictionary of results. Key -> Name of the operator, Value -> Benchmark results. + + """ + # Fetch all Binary Element_wise Operators + mx_binary_element_wise_ops = get_all_elemen_wise_binary_operators() + # Run benchmarks + mx_binary_op_results = run_op_benchmarks(mx_binary_element_wise_ops, dtype, ctx, warmup, runs) + return mx_binary_op_results diff --git a/benchmark/opperf/nd_operations/gemm_operators.py b/benchmark/opperf/nd_operations/gemm_operators.py new file mode 100644 index 000000000000..69a0f4c23121 --- /dev/null +++ b/benchmark/opperf/nd_operations/gemm_operators.py @@ -0,0 +1,89 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import mxnet as mx +from mxnet import nd +from benchmark.opperf.utils.benchmark_utils import run_performance_test +from benchmark.opperf.utils.common_utils import merge_map_list + +"""Performance benchmark tests for MXNet NDArray GEMM Operators. + +1. dot +2. batch_dot + +TODO +3. As part of default tests, following needs to be added: + 3.1 Sparse dot. (csr, default) -> row_sparse + 3.2 Sparse dot. (csr, row_sparse) -> default + 3.3 With Transpose of lhs + 3.4 With Transpose of rhs +4. 1D array: inner product of vectors +""" + + +def run_gemm_operators_benchmarks(ctx=mx.cpu(), dtype='float32', warmup=10, runs=50): + """Runs benchmarks with the given context and precision (dtype)for all the GEMM + operators (dot, batch_dot) in MXNet. + + Parameters + ---------- + ctx: mx.ctx + Context to run benchmarks + dtype: str, default 'float32' + Precision to use for benchmarks + warmup: int, default 10 + Number of times to run for warmup + runs: int, default 50 + Number of runs to capture benchmark results + + Returns + ------- + Dictionary of results. Key -> Name of the operator, Value -> Benchmark results. + + """ + # Benchmark tests for dot and batch_dot operators + dot_benchmark_res = run_performance_test( + [nd.dot], run_backward=True, + dtype=dtype, ctx=ctx, + inputs=[{"lhs": (1024, 1024), + "rhs": (1024, 1024)}, + {"lhs": (1000, 10), + "rhs": (1000, 10), + "transpose_b": True}, + {"lhs": (1000, 1), + "rhs": (100, 1000), + "transpose_a": True, + "transpose_b": True}], + warmup=warmup, runs=runs) + + batch_dot_benchmark_res = run_performance_test( + [nd.batch_dot], run_backward=True, + dtype=dtype, ctx=ctx, + inputs=[{"lhs": (32, 1024, 1024), + "rhs": (32, 1024, 1024)}, + {"lhs": (32, 1000, 10), + "rhs": (32, 1000, 10), + "transpose_b": True}, + {"lhs": (32, 1000, 1), + "rhs": (32, 100, 1000), + "transpose_a": True, + "transpose_b": True}], + warmup=warmup, runs=runs) + + # Prepare combined results for GEMM operators + mx_gemm_op_results = merge_map_list(dot_benchmark_res + batch_dot_benchmark_res) + return mx_gemm_op_results diff --git a/benchmark/opperf/nd_operations/nn_activation_operators.py b/benchmark/opperf/nd_operations/nn_activation_operators.py new file mode 100644 index 000000000000..16ea2c6f64f4 --- /dev/null +++ b/benchmark/opperf/nd_operations/nn_activation_operators.py @@ -0,0 +1,106 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +import mxnet as mx +from mxnet import nd +from benchmark.opperf.utils.benchmark_utils import run_performance_test +from benchmark.opperf.utils.common_utils import merge_map_list + +"""Performance benchmark tests for MXNet NDArray Activation Operators. + +1. LeakyRelu + 1.1 Elu + 1.2 Selu + 1.3 Leaky + 1.4 PRelu + 1.5 RRelu +3. Hard_Sigmoid +4. Softmax +5. Log_Softmax + +""" + + +def run_activation_operators_benchmarks(ctx=mx.cpu(), dtype='float32', warmup=10, runs=50): + """Runs benchmarks with the given context and precision (dtype)for all the activation + operators (relu, sigmoid, softmax) in MXNet. + + Parameters + ---------- + ctx: mx.ctx + Context to run benchmarks + dtype: str, default 'float32' + Precision to use for benchmarks + warmup: int, default 10 + Number of times to run for warmup + runs: int, default 50 + Number of runs to capture benchmark results + + Returns + ------- + Dictionary of results. Key -> Name of the operator, Value -> Benchmark results. + + """ + # Relu and its variation + relu_benchmark_res = run_performance_test([nd.LeakyReLU], + run_backward=True, + dtype=dtype, + ctx=ctx, + inputs=[{"data": (1024, 1024), "act_type": "leaky", "slope": 0.1}, + {"data": (10000, 1), "act_type": "leaky", "slope": 0.1}, + {"data": (10000, 100), "act_type": "leaky", "slope": 0.1}, + {"data": (1024, 1024), "act_type": "elu", "slope": 0.1}, + {"data": (10000, 1), "act_type": "elu", "slope": 0.1}, + {"data": (10000, 100), "act_type": "elu", "slope": 0.1}, + {"data": (1024, 1024), "act_type": "selu"}, + {"data": (10000, 1), "act_type": "selu"}, + {"data": (10000, 100), "act_type": "selu"}, + {"data": (1024, 1024), "act_type": "prelu", "gamma": (1, 1024)}, + {"data": (10000, 1), "act_type": "prelu", "gamma": (1, 1)}, + {"data": (10000, 100), "act_type": "prelu", "gamma": (1, 100)} + ], + warmup=warmup, + runs=runs) + + # Sigmoid => Covered as part of Unary ops + # Hard_Sigmoid + hard_sigmoid_benchmark_res = run_performance_test([nd.hard_sigmoid], + run_backward=True, + dtype=dtype, + ctx=ctx, + inputs=[{"data": (1024, 1024), "alpha": 0.25, "beta": 0.5}, + {"data": (10000, 1), "alpha": 0.25, "beta": 0.5}, + {"data": (10000, 100), "alpha": 0.25, "beta": 0.5} + ], + warmup=warmup, + runs=runs) + + # Softmax, LogSoftmax + softmax_benchmark_res = run_performance_test([nd.softmax, nd.log_softmax], + run_backward=True, + dtype=dtype, + ctx=ctx, + inputs=[{"data": (1024, 1024), "axis": -1, "temperature": 0.5}, + {"data": (10000, 1), "axis": -1, "temperature": 0.5}, + {"data": (10000, 100), "axis": -1, "temperature": 0.5} + ], + warmup=warmup, + runs=runs) + + # Prepare combined results + mx_activation_op_results = merge_map_list(relu_benchmark_res + hard_sigmoid_benchmark_res + softmax_benchmark_res) + return mx_activation_op_results diff --git a/benchmark/opperf/nd_operations/nn_basic_operators.py b/benchmark/opperf/nd_operations/nn_basic_operators.py new file mode 100644 index 000000000000..d91b285f41aa --- /dev/null +++ b/benchmark/opperf/nd_operations/nn_basic_operators.py @@ -0,0 +1,83 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import mxnet as mx +from mxnet import nd +from benchmark.opperf.utils.benchmark_utils import run_performance_test +from benchmark.opperf.utils.common_utils import merge_map_list + +"""Performance benchmark tests for MXNet NDArray basic NN Operators. + +1. FullyConnected +2. Dropout +3. BatchNorm + +""" + + +def run_nn_basic_operators_benchmarks(ctx=mx.cpu(), dtype='float32', warmup=10, runs=50): + # FullyConnnected operator benchmarks + fc_benchmark_res = run_performance_test([nd.FullyConnected], + run_backward=True, + dtype=dtype, + ctx=ctx, + inputs=[{"data": (32, 3, 256, 256), + "num_hidden": 64, + "weight": (64, 3 * 256 * 256), + "bias": (64,), + "flatten": True}, + {"data": (32, 3, 256, 256), + "num_hidden": 64, + "weight": (64, 256), + "bias": (64,), + "flatten": False}], + warmup=warmup, + runs=runs) + + # Dropout benchmarks + dropout_benchmark_res = run_performance_test([nd.Dropout], + run_backward=True, + dtype=dtype, + ctx=ctx, + inputs=[{"data": (32, 3, 256, 256), + "p": 0.5, + "mode": "always"}, + {"data": (10000, 10), + "p": 0.5, + "mode": "always"}], + warmup=warmup, + runs=runs) + # BatchNorm benchmarks + batchnorm_benchmark_res = run_performance_test([nd.BatchNorm], + run_backward=True, + dtype=dtype, + ctx=ctx, + inputs=[{"data": (32, 3, 256, 256), + "gamma": (3,), + "beta": (3,), + "moving_mean": (3,), + "moving_var": (3,)}, + {"data": (32, 3, 10000, 10), + "gamma": (3,), + "beta": (3,), + "moving_mean": (3,), + "moving_var": (3,)}], + warmup=warmup, + runs=runs) + # Prepare combined results + mx_basic_nn_results = merge_map_list(fc_benchmark_res + dropout_benchmark_res + batchnorm_benchmark_res) + return mx_basic_nn_results diff --git a/benchmark/opperf/nd_operations/nn_conv_operators.py b/benchmark/opperf/nd_operations/nn_conv_operators.py new file mode 100644 index 000000000000..e4749ec90de4 --- /dev/null +++ b/benchmark/opperf/nd_operations/nn_conv_operators.py @@ -0,0 +1,139 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import mxnet as mx +from mxnet import nd +from benchmark.opperf.utils.benchmark_utils import run_performance_test +from benchmark.opperf.utils.common_utils import merge_map_list + +"""Performance benchmark tests for MXNet NDArray Convolution and Pooling Operators. + +MXNet NDArray Pooling Operators + +1. MaxPool1D +2. MaxPool2D +3. SumPool1D +4. SumPool2D +4. AvgPool1D +5. AvgPool2D +6. 
GlobalMaxPool1D +7. GlobalMaxPool2D +8. GlobalAvgPool1D +9. GlobalAvgPool2D +10.GlobalSumPool1D +11.GlobalSumPool2D + +(Under the hood uses mx.nd.pooling) + +MXNet NDArray NN Convolution Operators + +1. Conv1D +2. Conv2D +3. Conv1DTranspose (DeConvolution) +4. Conv2DTranspose (DeConvolution) + +(Under the hood uses mx.nd.convolution, mx.nd.Deconvolution) + +""" + + +def run_pooling_operators_benchmarks(ctx=mx.cpu(), dtype='float32', warmup=10, runs=50): + pool_types = ['avg', 'max', 'sum'] + global_pool_types = [0, 1] + + # Run 1D and 2D Pooling performance runs + pool1d_benchmark_res = [] + pool2d_benchmark_res = [] + for pool_type in pool_types: + for global_pool in global_pool_types: + for pool1d_data in [(32, 3, 256), (32, 3, 64)]: + pool1d_benchmark_res += run_performance_test([nd.Pooling], + run_backward=True, + dtype=dtype, + ctx=ctx, + inputs=[{"data": pool1d_data, + "kernel": 3, + "pool_type": pool_type, + "global_pool": global_pool, + "stride": 1, + "pad": 1, + "layout": 'NCW'} + ], + warmup=warmup, + runs=runs) + for pool2d_data in [(32, 3, 256, 256), (32, 3, 64, 64)]: + pool2d_benchmark_res += run_performance_test([nd.Pooling], + run_backward=True, + dtype=dtype, + ctx=ctx, + inputs=[{"data": pool2d_data, + "kernel": (3, 3), + "pool_type": pool_type, + "global_pool": global_pool, + "stride": (1, 1), + "pad": (0, 0), + "layout": 'NCHW'} + ], + warmup=warmup, + runs=runs) + # Prepare combined results + mx_pooling_op_results = merge_map_list(pool1d_benchmark_res + pool2d_benchmark_res) + return mx_pooling_op_results + + +def run_convolution_operators_benchmarks(ctx=mx.cpu(), dtype='float32', warmup=10, runs=50): + # Conv1D Benchmarks + conv1d_benchmark_res = [] + for conv_data in [(32, 3, 256), (32, 3, 64)]: + conv1d_benchmark_res += run_performance_test([nd.Convolution], + run_backward=True, + dtype=dtype, + ctx=ctx, + inputs=[{"data": conv_data, + "weight": (64, 3, 3,), + "bias": (64,), + "kernel": (3,), + "stride": (1,), + "dilate": (1,), + "pad": (0,), + "num_filter": 64, + "layout": 'NCW'} + ], + warmup=warmup, + runs=runs) + # Conv2D Benchmarks + conv2d_benchmark_res = [] + for conv_data in [(32, 3, 256, 256), (32, 3, 64, 64)]: + conv2d_benchmark_res += run_performance_test([nd.Convolution], + run_backward=True, + dtype=dtype, + ctx=ctx, + inputs=[{"data": conv_data, + "weight": (64, 3, 3, 3), + "bias": (64,), + "kernel": (3, 3), + "stride": (1, 1), + "dilate": (1, 1), + "pad": (0, 0), + "num_filter": 64, + "layout": 'NCHW'} + ], + warmup=warmup, + runs=runs) + # Prepare combined results + mx_conv_op_results = merge_map_list(conv1d_benchmark_res + conv2d_benchmark_res) + return mx_conv_op_results diff --git a/benchmark/opperf/nd_operations/random_sampling_operators.py b/benchmark/opperf/nd_operations/random_sampling_operators.py new file mode 100644 index 000000000000..bad8c8e4c040 --- /dev/null +++ b/benchmark/opperf/nd_operations/random_sampling_operators.py @@ -0,0 +1,61 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Performance benchmark tests for MXNet NDArray Random Sampling Operations. +1. Operators are automatically fetched from MXNet operator registry. +2. Default Inputs are generated. See rules/default_params.py. You can override the default values. + +Below 16 random sampling Operators are covered: + +['random_exponential', 'random_gamma', 'random_generalized_negative_binomial', 'random_negative_binomial', +'random_normal', 'random_poisson', 'random_randint', 'random_uniform', 'sample_exponential', 'sample_gamma', +'sample_generalized_negative_binomial', 'sample_multinomial', 'sample_negative_binomial', 'sample_normal', +'sample_poisson', 'sample_uniform'] + +""" + +import mxnet as mx + +from benchmark.opperf.utils.benchmark_utils import run_op_benchmarks +from benchmark.opperf.utils.op_registry_utils import get_all_random_sampling_operators + + +def run_mx_random_sampling_operators_benchmarks(ctx=mx.cpu(), dtype='float32', warmup=10, runs=50): + """Runs benchmarks with the given context and precision (dtype)for all the random sampling + operators in MXNet. + + Parameters + ---------- + ctx: mx.ctx + Context to run benchmarks + dtype: str, default 'float32' + Precision to use for benchmarks + warmup: int, default 10 + Number of times to run for warmup + runs: int, default 50 + Number of runs to capture benchmark results + + Returns + ------- + Dictionary of results. Key -> Name of the operator, Value -> Benchmark results. + + """ + # Fetch all Random Sampling Operators + mx_random_sample_ops = get_all_random_sampling_operators() + # Run benchmarks + mx_random_sample_op_results = run_op_benchmarks(mx_random_sample_ops, dtype, ctx, warmup, runs) + return mx_random_sample_op_results diff --git a/benchmark/opperf/nd_operations/reduction_operators.py b/benchmark/opperf/nd_operations/reduction_operators.py new file mode 100644 index 000000000000..5bfe06621136 --- /dev/null +++ b/benchmark/opperf/nd_operations/reduction_operators.py @@ -0,0 +1,58 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Performance benchmark tests for MXNet NDArray Reduction Operations. +1. Operators are automatically fetched from MXNet operator registry. +2. Default Inputs are generated. See rules/default_params.py. You can override the default values. 
+ +Below 10 reduction Operators are covered: + +['max', 'max_axis', 'mean', 'min', 'min_axis', 'nanprod', 'nansum', 'prod', 'sum', 'sum_axis'] + +""" + +import mxnet as mx + +from benchmark.opperf.utils.op_registry_utils import get_all_reduction_operators +from benchmark.opperf.utils.benchmark_utils import run_op_benchmarks + + +def run_mx_reduction_operators_benchmarks(ctx=mx.cpu(), dtype='float32', warmup=10, runs=50): + """Runs benchmarks with the given context and precision (dtype)for all the reduction + operators in MXNet. + + Parameters + ---------- + ctx: mx.ctx + Context to run benchmarks + dtype: str, default 'float32' + Precision to use for benchmarks + warmup: int, default 10 + Number of times to run for warmup + runs: int, default 50 + Number of runs to capture benchmark results + + Returns + ------- + Dictionary of results. Key -> Name of the operator, Value -> Benchmark results. + + """ + # Fetch all Reduction Operators + mx_reduction_broadcast_ops = get_all_reduction_operators() + # Run benchmarks + mx_reduction_op_results = run_op_benchmarks(mx_reduction_broadcast_ops, dtype, ctx, warmup, runs) + return mx_reduction_op_results diff --git a/benchmark/opperf/nd_operations/unary_operators.py b/benchmark/opperf/nd_operations/unary_operators.py new file mode 100644 index 000000000000..a562eebf2a92 --- /dev/null +++ b/benchmark/opperf/nd_operations/unary_operators.py @@ -0,0 +1,62 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Performance benchmark tests for MXNet NDArray Unary Operations. +1. Operators are automatically fetched from MXNet operator registry. +2. Default Inputs are generated. See rules/default_params.py. You can override the default values. + +Below 54 unary Operators are covered: + +['BlockGrad', 'Flatten', 'abs', 'arccos', 'arccosh', 'arcsin', 'arcsinh', 'arctan', 'arctanh', +'argmax_channel', 'cbrt', 'ceil', 'cos', 'cosh', 'degrees', 'erf', 'erfinv', 'exp', 'expm1', 'fix', 'flatten', +'floor', 'gamma', 'gammaln', 'identity', 'log', 'log10', 'log1p', 'log2', 'logical_not', 'make_loss', 'negative', +'ones_like', 'radians', 'rcbrt', 'reciprocal', 'relu', 'rint', 'round', 'rsqrt', 'shuffle', 'sigmoid', 'sign', +'sin', 'sinh', 'size_array', 'softsign', 'sqrt', 'square', 'stop_gradient', 'tan', 'tanh', 'trunc', 'zeros_like'] + +""" + +import mxnet as mx + +from benchmark.opperf.utils.op_registry_utils import get_all_unary_operators +from benchmark.opperf.utils.benchmark_utils import run_op_benchmarks + + +def run_mx_unary_operators_benchmarks(ctx=mx.cpu(), dtype='float32', warmup=10, runs=50): + """Runs benchmarks with the given context and precision (dtype)for all the unary + operators in MXNet. 
+ + Parameters + ---------- + ctx: mx.ctx + Context to run benchmarks + dtype: str, default 'float32' + Precision to use for benchmarks + warmup: int, default 10 + Number of times to run for warmup + runs: int, default 50 + Number of runs to capture benchmark results + + Returns + ------- + Dictionary of results. Key -> Name of the operator, Value -> Benchmark results. + + """ + # Fetch all Unary Operators + mx_unary_broadcast_ops = get_all_unary_operators() + # Run benchmarks + mx_unary_op_results = run_op_benchmarks(mx_unary_broadcast_ops, dtype, ctx, warmup, runs) + return mx_unary_op_results diff --git a/benchmark/opperf/opperf.py b/benchmark/opperf/opperf.py new file mode 100755 index 000000000000..34c6cf96b723 --- /dev/null +++ b/benchmark/opperf/opperf.py @@ -0,0 +1,145 @@ +#!/usr/bin/env python3 +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +# -*- coding: utf-8 -*- + +"""Commandline utility to run operator benchmarks""" + +import argparse +import logging +import os +import sys + +import mxnet as mx + +from benchmark.opperf.nd_operations.unary_operators import run_mx_unary_operators_benchmarks +from benchmark.opperf.nd_operations.binary_operators import run_mx_binary_broadcast_operators_benchmarks, \ + run_mx_binary_element_wise_operators_benchmarks +from benchmark.opperf.nd_operations.gemm_operators import run_gemm_operators_benchmarks +from benchmark.opperf.nd_operations.random_sampling_operators import run_mx_random_sampling_operators_benchmarks +from benchmark.opperf.nd_operations.reduction_operators import run_mx_reduction_operators_benchmarks +from benchmark.opperf.nd_operations.nn_activation_operators import run_activation_operators_benchmarks +from benchmark.opperf.nd_operations.nn_conv_operators import run_pooling_operators_benchmarks, \ + run_convolution_operators_benchmarks +from benchmark.opperf.nd_operations.nn_basic_operators import run_nn_basic_operators_benchmarks + +from benchmark.opperf.utils.common_utils import merge_map_list, save_to_file +from benchmark.opperf.utils.op_registry_utils import get_operators_with_no_benchmark + + +def run_all_mxnet_operator_benchmarks(ctx=mx.cpu(), dtype='float32'): + """Run all the MXNet operators (NDArray) benchmarks. + + Returns + ------- + Dictionary of benchmark results. 
+ """ + mxnet_operator_benchmark_results = [] + + # *************************MXNET TENSOR OPERATOR BENCHMARKS***************************** + + # Run all Unary operations benchmarks with default input values + mxnet_operator_benchmark_results.append(run_mx_unary_operators_benchmarks(ctx=ctx, dtype=dtype)) + + # Run all Binary Broadcast, element_wise operations benchmarks with default input values + mxnet_operator_benchmark_results.append(run_mx_binary_broadcast_operators_benchmarks(ctx=ctx, + dtype=dtype)) + mxnet_operator_benchmark_results.append(run_mx_binary_element_wise_operators_benchmarks(ctx=ctx, + dtype=dtype)) + + # Run all GEMM operations benchmarks with default input values + mxnet_operator_benchmark_results.append(run_gemm_operators_benchmarks(ctx=ctx, + dtype=dtype)) + + # Run all Random sampling operations benchmarks with default input values + mxnet_operator_benchmark_results.append(run_mx_random_sampling_operators_benchmarks(ctx=ctx, dtype=dtype)) + + # Run all Reduction operations benchmarks with default input values + mxnet_operator_benchmark_results.append(run_mx_reduction_operators_benchmarks(ctx=ctx, dtype=dtype)) + + # ************************ MXNET NN OPERATOR BENCHMARKS **************************** + + # Run all basic NN operations benchmarks with default input values + mxnet_operator_benchmark_results.append(run_nn_basic_operators_benchmarks(ctx=ctx, dtype=dtype)) + + # Run all Activation operations benchmarks with default input values + mxnet_operator_benchmark_results.append(run_activation_operators_benchmarks(ctx=ctx, dtype=dtype)) + + # Run all Pooling operations benchmarks with default input values + mxnet_operator_benchmark_results.append(run_pooling_operators_benchmarks(ctx=ctx, dtype=dtype)) + + # Run all Convolution operations benchmarks with default input values + mxnet_operator_benchmark_results.append(run_convolution_operators_benchmarks(ctx=ctx, dtype=dtype)) + + # ****************************** PREPARE FINAL RESULTS ******************************** + final_benchmark_result_map = merge_map_list(mxnet_operator_benchmark_results) + return final_benchmark_result_map + + +def _parse_mxnet_context(ctx): + if not ctx: + raise ValueError("Context cannot be null or empty") + + if ctx.lower() in ['cpu', 'gpu']: + return mx.context.Context(ctx) + elif ctx.lower().startwith('gpu('): + device_id = int(ctx[4:-1]) + return mx.gpu(device_id) + +def main(): + # 1. GET USER INPUTS + parser = argparse.ArgumentParser( + description='Run all the MXNet operators (NDArray) benchmarks') + + parser.add_argument('--ctx', type=str, default='cpu', + help='Global context to run all benchmarks. By default, cpu on a ' + 'CPU machine, gpu(0) on a GPU machine. ' + 'Valid Inputs - cpu, gpu, gpu(0), gpu(1)...') + parser.add_argument('--dtype', type=str, default='float32', help='DType (Precision) to run benchmarks. By default, ' + 'float32. Valid Inputs - float32, float64.') + parser.add_argument('-f', '--output-format', type=str, default='json', + choices=['json', 'md'], + help='Benchmark result output format. By default, json. ' + 'Valid Inputs - json, md') + + parser.add_argument('-o', '--output-file', type=str, default='./mxnet_operator_benchmarks.json', + help='Name and path for the ' + 'output file.') + + args = parser.parse_args() + logging.info(f"Running MXNet operator benchmarks with the following options: {args}") + assert not os.path.isfile(args.output_file), f"Output file {args.output_file} already exists." + + # 2. 
RUN BENCHMARKS + ctx = _parse_mxnet_context(args.ctx) + dtype = args.dtype + final_benchmark_results = run_all_mxnet_operator_benchmarks(ctx=ctx, dtype=args.dtype) + + # 3. PREPARE OUTPUTS + save_to_file(final_benchmark_results, args.output_file, args.output_format) + + # 4. Generate list of MXNet operators not covered in benchmarks + ops_not_covered = get_operators_with_no_benchmark(final_benchmark_results.keys()) + for idx, op in enumerate(ops_not_covered): + print(f"{idx}. {op}") + return 0 + +if __name__ == '__main__': + sys.exit(main()) + diff --git a/benchmark/opperf/results/mxnet_operator_benchmark_results_cpu.md b/benchmark/opperf/results/mxnet_operator_benchmark_results_cpu.md new file mode 100644 index 000000000000..9e2ffee25e1c --- /dev/null +++ b/benchmark/opperf/results/mxnet_operator_benchmark_results_cpu.md @@ -0,0 +1,322 @@ + + + + + + + + + + + + + + + + + +# MXNet Operator Benchmarks + +## Settings + +1. MXNet - v1.4.1 +2. Instance - C5.8x + +| Operator | Avg Forward Time (ms) | Avg. Backward Time (ms) | Max Mem Usage (Storage) (Bytes) | Inputs | +| :---: | :---: | :---: | :---:| :--- | +| shuffle | 0.8901 | --- | 4194.3042 | {'data': (1024, 1024)} | +| shuffle | 1.2146 | --- | 40.0 | {'data': (10000, 1)} | +| shuffle | 1.8777 | --- | 4000.0 | {'data': (10000, 100)} | +| broadcast_equal | 0.006 | --- | 0.024 | {'lhs': [(1024, 1024), (10000, 10), (10000, 1)], 'rhs': [(1024, 1024), (10000, 10), (10000, 1)]} | +| broadcast_hypot | 0.0108 | 0.0135 | 0.024 | {'lhs': [(1024, 1024), (10000, 10), (10000, 1)], 'rhs': [(1024, 1024), (10000, 10), (10000, 1)]} | +| ceil | 3.4305 | --- | 4194.3042 | {'data': (1024, 1024)} | +| ceil | 0.0507 | --- | 40.0 | {'data': (10000, 1)} | +| ceil | 3.317 | --- | 4000.0 | {'data': (10000, 100)} | +| sum | 32.4206 | 25.5443 | 0.002 | {'data': (1024, 1024), 'axis': ()} | +| sum | 0.3393 | 0.2507 | 0.004 | {'data': (10000, 1), 'axis': 0} | +| sum | 31.0189 | 24.7422 | 0.002 | {'data': (10000, 100), 'axis': (0, 1)} | +| broadcast_logical_xor | 0.0068 | --- | 0.024 | {'lhs': [(1024, 1024), (10000, 10), (10000, 1)], 'rhs': [(1024, 1024), (10000, 10), (10000, 1)]} | +| erf | 35.0669 | 16.5842 | 4194.3042 | {'data': (1024, 1024)} | +| erf | 0.3982 | 0.1734 | 40.0 | {'data': (10000, 1)} | +| erf | 29.4103 | 14.3537 | 4000.0 | {'data': (10000, 100)} | +| tanh | 11.2211 | 6.1798 | 2097.1521 | {'data': (1024, 1024)} | +| tanh | 0.1628 | 0.0622 | 40.0 | {'data': (10000, 1)} | +| tanh | 10.7941 | 6.0085 | 4000.0 | {'data': (10000, 100)} | +| arcsinh | 10.0168 | 8.5245 | 2097.1521 | {'data': (1024, 1024)} | +| arcsinh | 0.1111 | 0.0905 | 40.0 | {'data': (10000, 1)} | +| arcsinh | 9.4415 | 7.9082 | 2000.0 | {'data': (10000, 100)} | +| fix | 15.541 | --- | 4194.3042 | {'data': (1024, 1024)} | +| fix | 0.1615 | --- | 40.0 | {'data': (10000, 1)} | +| fix | 14.591 | --- | 4000.0 | {'data': (10000, 100)} | +| broadcast_maximum | 0.0097 | 0.0099 | 0.024 | {'lhs': [(1024, 1024), (10000, 10), (10000, 1)], 'rhs': [(1024, 1024), (10000, 10), (10000, 1)]} | +| sin | 14.4123 | 16.5642 | 2097.1521 | {'data': (1024, 1024)} | +| sin | 0.1459 | 0.156 | 40.0 | {'data': (10000, 1)} | +| sin | 13.821 | 15.4752 | 2000.0 | {'data': (10000, 100)} | +| random_normal | 151.0089 | --- | 4194.3042 | {'shape': (1024, 1024)} | +| random_normal | 1.456 | --- | 40.0 | {'shape': (10000, 1)} | +| random_normal | 144.775 | --- | 2000.0 | {'shape': (10000, 100)} | +| sqrt | 3.3861 | 5.1123 | 2097.1521 | {'data': (1024, 1024)} | +| sqrt | 0.0393 | 0.0548 | 20.0 | {'data': (10000, 1)} | +| sqrt | 
3.3037 | 4.7883 | 2000.0 | {'data': (10000, 100)} | +| BlockGrad | 0.3275 | --- | 4194.3042 | {'data': (1024, 1024)} | +| BlockGrad | 0.0161 | --- | 40.0 | {'data': (10000, 1)} | +| BlockGrad | 0.3118 | --- | 4000.0 | {'data': (10000, 100)} | +| sample_exponential | 123.8534 | --- | 8388.6084 | {'lam': [1.0, 8.5], 'shape': (1024, 1024)} | +| sample_exponential | 1.3394 | --- | 80.0 | {'lam': [1.0, 8.5], 'shape': (10000, 1)} | +| sample_exponential | 118.4786 | --- | 8000.0 | {'lam': [1.0, 8.5], 'shape': (10000, 100)} | +| sample_gamma | 529.0305 | --- | 8388.6084 | {'alpha': [0.0, 2.5], 'shape': (1024, 1024), 'beta': [1.0, 0.7]} | +| sample_gamma | 5.7426 | --- | 80.0 | {'alpha': [0.0, 2.5], 'shape': (10000, 1), 'beta': [1.0, 0.7]} | +| sample_gamma | 496.0531 | --- | 8000.0 | {'alpha': [0.0, 2.5], 'shape': (10000, 100), 'beta': [1.0, 0.7]} | +| log2 | 12.3183 | 4.5842 | 2097.1521 | {'data': (1024, 1024)} | +| log2 | 0.1269 | 0.0459 | 40.0 | {'data': (10000, 1)} | +| log2 | 11.6719 | 4.2632 | 4000.0 | {'data': (10000, 100)} | +| broadcast_greater_equal | 0.0092 | --- | 0.024 | {'lhs': [(1024, 1024), (10000, 10), (10000, 1)], 'rhs': [(1024, 1024), (10000, 10), (10000, 1)]} | +| FullyConnected | 18.4677 | 21.6917 | 8.192 | {'data': (32, 3, 256, 256), 'num_hidden': 64, 'weight': (64, 196608), 'bias': (64,), 'flatten': True} | +| FullyConnected | 20.3379 | 38.8295 | 6291.4561 | {'data': (32, 3, 256, 256), 'num_hidden': 64, 'weight': (64, 256), 'bias': (64,), 'flatten': False} | +| cos | 14.8699 | 16.8678 | 2097.1521 | {'data': (1024, 1024)} | +| cos | 0.1511 | 0.1585 | 40.0 | {'data': (10000, 1)} | +| cos | 14.0109 | 15.5246 | 2000.0 | {'data': (10000, 100)} | +| broadcast_mul | 0.0075 | 0.0075 | 0.024 | {'lhs': [(1024, 1024), (10000, 10), (10000, 1)], 'rhs': [(1024, 1024), (10000, 10), (10000, 1)]} | +| arccos | 21.5631 | 12.8768 | 4194.3042 | {'data': (1024, 1024)} | +| arccos | 0.1719 | 0.1084 | 40.0 | {'data': (10000, 1)} | +| arccos | 15.3153 | 7.9161 | 2000.0 | {'data': (10000, 100)} | +| stop_gradient | --- | --- | 4194.3042 | {'data': (1024, 1024)} | +| stop_gradient | --- | --- | 40.0 | {'data': (10000, 1)} | +| stop_gradient | --- | --- | 4000.0 | {'data': (10000, 100)} | +| broadcast_sub | 0.0078 | 0.0059 | 0.012 | {'lhs': [(1024, 1024), (10000, 10), (10000, 1)], 'rhs': [(1024, 1024), (10000, 10), (10000, 1)]} | +| random_poisson | 112.7425 | --- | 4194.3042 | {'shape': (1024, 1024)} | +| random_poisson | 1.0701 | --- | 40.0 | {'shape': (10000, 1)} | +| random_poisson | 114.3405 | --- | 2000.0 | {'shape': (10000, 100)} | +| rsqrt | 4.3564 | 7.0663 | 2097.1521 | {'data': (1024, 1024)} | +| rsqrt | 0.075 | 0.0861 | 40.0 | {'data': (10000, 1)} | +| rsqrt | 4.5076 | 6.6598 | 4000.0 | {'data': (10000, 100)} | +| nansum | 34.2019 | 57.1624 | 0.002 | {'data': (1024, 1024), 'axis': ()} | +| nansum | 0.3683 | 0.5326 | 0.002 | {'data': (10000, 1), 'axis': 0} | +| nansum | 32.9698 | 55.4243 | 0.002 | {'data': (10000, 100), 'axis': (0, 1)} | +| hard_sigmoid | 7.5926 | 6.5839 | 2097.1521 | {'data': (1024, 1024), 'alpha': 0.25, 'beta': 0.5} | +| hard_sigmoid | 0.1086 | 0.0895 | 40.0 | {'data': (10000, 1), 'alpha': 0.25, 'beta': 0.5} | +| hard_sigmoid | 8.1285 | 6.6014 | 4000.0 | {'data': (10000, 100), 'alpha': 0.25, 'beta': 0.5} | +| softmax | 25.4074 | 9.4933 | 2097.1521 | {'data': (1024, 1024), 'axis': -1, 'temperature': 0.5} | +| softmax | 0.4022 | 0.3145 | 40.0 | {'data': (10000, 1), 'axis': -1, 'temperature': 0.5} | +| softmax | 25.604 | 9.4286 | 4000.0 | {'data': (10000, 100), 'axis': -1, 
'temperature': 0.5} | +| random_negative_binomial | 285.8721 | --- | 4194.3042 | {'k': 1, 'p': 1, 'shape': (1024, 1024)} | +| random_negative_binomial | 2.839 | --- | 40.0 | {'k': 1, 'p': 1, 'shape': (10000, 1)} | +| random_negative_binomial | 273.034 | --- | 2000.0 | {'k': 1, 'p': 1, 'shape': (10000, 100)} | +| BatchNorm | 66.062 | 88.4693 | 25165.8359 | {'data': (32, 3, 256, 256), 'gamma': (3,), 'beta': (3,), 'moving_mean': (3,), 'moving_var': (3,)} | +| BatchNorm | 101.3006 | 134.4362 | 38400.0117 | {'data': (32, 3, 10000, 10), 'gamma': (3,), 'beta': (3,), 'moving_mean': (3,), 'moving_var': (3,)} | +| Pooling | 0.5533 | 0.6485 | 49.152 | {'data': (32, 3, 256), 'kernel': 3, 'pool_type': 'avg', 'global_pool': 0, 'stride': 1, 'pad': 1, 'layout': 'NCW'} | +| radians | 3.3238 | 3.9704 | 4194.3042 | {'data': (1024, 1024)} | +| radians | 0.0391 | 0.0436 | 40.0 | {'data': (10000, 1)} | +| radians | 3.2462 | 3.775 | 4000.0 | {'data': (10000, 100)} | +| arctanh | 13.3211 | 6.3172 | 2097.1521 | {'data': (1024, 1024)} | +| arctanh | 0.1498 | 0.0683 | 40.0 | {'data': (10000, 1)} | +| arctanh | 12.5376 | 6.0177 | 2000.0 | {'data': (10000, 100)} | +| nanprod | 34.3464 | 57.9841 | 0.004 | {'data': (1024, 1024), 'axis': ()} | +| nanprod | 0.3638 | 0.5336 | 0.004 | {'data': (10000, 1), 'axis': 0} | +| nanprod | 32.83 | 55.2982 | 0.002 | {'data': (10000, 100), 'axis': (0, 1)} | +| elemwise_add | 0.0065 | --- | 0.024 | {'lhs': [(1024, 1024), (10000, 10), (10000, 1)], 'rhs': [(1024, 1024), (10000, 10), (10000, 1)]} | +| cosh | 8.4872 | 10.6597 | 2097.1521 | {'data': (1024, 1024)} | +| cosh | 0.1015 | 0.1201 | 40.0 | {'data': (10000, 1)} | +| cosh | 8.3937 | 10.6244 | 4000.0 | {'data': (10000, 100)} | +| tan | 15.4508 | 6.0752 | 2097.1521 | {'data': (1024, 1024)} | +| tan | 0.1549 | 0.0591 | 40.0 | {'data': (10000, 1)} | +| tan | 14.6992 | 5.802 | 2000.0 | {'data': (10000, 100)} | +| broadcast_not_equal | 0.0054 | --- | 0.012 | {'lhs': [(1024, 1024), (10000, 10), (10000, 1)], 'rhs': [(1024, 1024), (10000, 10), (10000, 1)]} | +| trunc | 3.493 | --- | 2097.1521 | {'data': (1024, 1024)} | +| trunc | 0.0505 | --- | 40.0 | {'data': (10000, 1)} | +| trunc | 3.1751 | --- | 2000.0 | {'data': (10000, 100)} | +| min_axis | 36.7382 | --- | 0.004 | {'data': (1024, 1024), 'axis': ()} | +| min_axis | 0.4225 | --- | 0.004 | {'data': (10000, 1), 'axis': 0} | +| min_axis | 31.3261 | --- | 0.004 | {'data': (10000, 100), 'axis': (0, 1)} | +| random_uniform | 44.7633 | --- | 4194.3042 | {'low': 0, 'high': 5, 'shape': (1024, 1024)} | +| random_uniform | 0.4607 | --- | 40.0 | {'low': 0, 'high': 5, 'shape': (10000, 1)} | +| random_uniform | 42.9135 | --- | 4000.0 | {'low': 0, 'high': 5, 'shape': (10000, 100)} | +| abs | 4.3965 | 13.406 | 4194.3042 | {'data': (1024, 1024)} | +| abs | 0.0696 | 0.1374 | 40.0 | {'data': (10000, 1)} | +| abs | 4.3552 | 13.7197 | 4000.0 | {'data': (10000, 100)} | +| broadcast_lesser_equal | 0.0054 | --- | 0.024 | {'lhs': [(1024, 1024), (10000, 10), (10000, 1)], 'rhs': [(1024, 1024), (10000, 10), (10000, 1)]} | +| random_randint | 65.414 | --- | 4194.3042 | {'low': 0, 'high': 5, 'shape': (1024, 1024)} | +| random_randint | 0.6331 | --- | 40.0 | {'low': 0, 'high': 5, 'shape': (10000, 1)} | +| random_randint | 61.32 | --- | 4000.0 | {'low': 0, 'high': 5, 'shape': (10000, 100)} | +| log1p | 13.6758 | 5.2497 | 2097.1521 | {'data': (1024, 1024)} | +| log1p | 0.1493 | 0.0562 | 40.0 | {'data': (10000, 1)} | +| log1p | 12.9494 | 5.0609 | 2000.0 | {'data': (10000, 100)} | +| log | 11.9666 | 5.1096 | 4194.3042 | 
{'data': (1024, 1024)} | +| log | 0.1306 | 0.0588 | 40.0 | {'data': (10000, 1)} | +| log | 11.8985 | 5.0319 | 2000.0 | {'data': (10000, 100)} | +| round | 14.6427 | --- | 4194.3042 | {'data': (1024, 1024)} | +| round | 0.1424 | --- | 20.0 | {'data': (10000, 1)} | +| round | 13.58 | --- | 2000.0 | {'data': (10000, 100)} | +| sample_negative_binomial | 1263.9417 | --- | 8388.6084 | {'k': [20, 49], 'shape': (1024, 1024), 'p': [0.4, 0.77]} | +| sample_negative_binomial | 12.5213 | --- | 80.0 | {'k': [20, 49], 'shape': (10000, 1), 'p': [0.4, 0.77]} | +| sample_negative_binomial | 1207.5739 | --- | 8000.0 | {'k': [20, 49], 'shape': (10000, 100), 'p': [0.4, 0.77]} | +| max | 30.7008 | 55.863 | 0.002 | {'data': (1024, 1024), 'axis': ()} | +| max | 0.3287 | 0.5147 | 0.004 | {'data': (10000, 1), 'axis': 0} | +| max | 29.4913 | 53.255 | 0.002 | {'data': (10000, 100), 'axis': (0, 1)} | +| mean | 31.9337 | 35.9235 | 0.002 | {'data': (1024, 1024), 'axis': ()} | +| mean | 0.4088 | 0.3453 | 0.002 | {'data': (10000, 1), 'axis': 0} | +| mean | 31.5658 | 34.609 | 0.004 | {'data': (10000, 100), 'axis': (0, 1)} | +| sign | 10.1736 | 4.1682 | 4194.3042 | {'data': (1024, 1024)} | +| sign | 0.1251 | 0.0588 | 40.0 | {'data': (10000, 1)} | +| sign | 9.5196 | 3.9109 | 2000.0 | {'data': (10000, 100)} | +| broadcast_power | 0.0117 | 0.0112 | 0.024 | {'lhs': [(1024, 1024), (10000, 10), (10000, 1)], 'rhs': [(1024, 1024), (10000, 10), (10000, 1)]} | +| argmax_channel | 10.9332 | --- | 4.096 | {'data': (1024, 1024)} | +| argmax_channel | 0.2703 | --- | 40.0 | {'data': (10000, 1)} | +| argmax_channel | 10.7759 | --- | 40.0 | {'data': (10000, 100)} | +| flatten | --- | --- | 4194.3042 | {'data': (1024, 1024)} | +| flatten | --- | --- | 40.0 | {'data': (10000, 1)} | +| flatten | --- | --- | 4000.0 | {'data': (10000, 100)} | +| ones_like | 2.127 | --- | 4194.3042 | {'data': (1024, 1024)} | +| ones_like | 0.028 | --- | 40.0 | {'data': (10000, 1)} | +| ones_like | 1.8846 | --- | 4000.0 | {'data': (10000, 100)} | +| negative | 2.6672 | --- | 4194.3042 | {'data': (1024, 1024)} | +| negative | 0.0321 | --- | 40.0 | {'data': (10000, 1)} | +| negative | 2.4958 | --- | 4000.0 | {'data': (10000, 100)} | +| elemwise_mul | 0.0054 | --- | 0.012 | {'lhs': [(1024, 1024), (10000, 10), (10000, 1)], 'rhs': [(1024, 1024), (10000, 10), (10000, 1)]} | +| batch_dot | 766.5307 | 1365.6267 | 134217.7344 | {'lhs': (32, 1024, 1024), 'rhs': (32, 1024, 1024)} | +| batch_dot | 37.618 | 46.1098 | 128000.0 | {'lhs': (32, 1000, 10), 'rhs': (32, 1000, 10), 'transpose_b': True} | +| batch_dot | 1.3618 | 4.0882 | 6.4 | {'lhs': (32, 1000, 1), 'rhs': (32, 100, 1000), 'transpose_a': True, 'transpose_b': True} | +| sum_axis | 33.2033 | --- | 0.004 | {'data': (1024, 1024), 'axis': ()} | +| sum_axis | 0.3155 | --- | 0.004 | {'data': (10000, 1), 'axis': 0} | +| sum_axis | 30.9792 | --- | 0.004 | {'data': (10000, 100), 'axis': (0, 1)} | +| floor | 3.5835 | --- | 4194.3042 | {'data': (1024, 1024)} | +| floor | 0.0499 | --- | 20.0 | {'data': (10000, 1)} | +| floor | 3.3519 | --- | 4000.0 | {'data': (10000, 100)} | +| logical_not | 3.0748 | --- | 4194.3042 | {'data': (1024, 1024)} | +| logical_not | 0.0319 | --- | 40.0 | {'data': (10000, 1)} | +| logical_not | 3.0173 | --- | 4000.0 | {'data': (10000, 100)} | +| log10 | 12.3647 | 4.5036 | 2097.1521 | {'data': (1024, 1024)} | +| log10 | 0.1647 | 0.0619 | 40.0 | {'data': (10000, 1)} | +| log10 | 11.7758 | 4.231 | 2000.0 | {'data': (10000, 100)} | +| rcbrt | 11.737 | 14.931 | 2097.1521 | {'data': (1024, 1024)} | +| rcbrt | 
0.1241 | 0.1421 | 40.0 | {'data': (10000, 1)} | +| rcbrt | 11.2254 | 14.2139 | 2000.0 | {'data': (10000, 100)} | +| broadcast_logical_or | 0.0093 | --- | 0.024 | {'lhs': [(1024, 1024), (10000, 10), (10000, 1)], 'rhs': [(1024, 1024), (10000, 10), (10000, 1)]} | +| sample_normal | 304.5372 | --- | 8388.6084 | {'mu': [2.0, 2.5], 'shape': (1024, 1024), 'sigma': [1.0, 3.7]} | +| sample_normal | 2.8403 | --- | 80.0 | {'mu': [2.0, 2.5], 'shape': (10000, 1), 'sigma': [1.0, 3.7]} | +| sample_normal | 284.6853 | --- | 8000.0 | {'mu': [2.0, 2.5], 'shape': (10000, 100), 'sigma': [1.0, 3.7]} | +| broadcast_minimum | 0.0073 | 0.0073 | 0.012 | {'lhs': [(1024, 1024), (10000, 10), (10000, 1)], 'rhs': [(1024, 1024), (10000, 10), (10000, 1)]} | +| arctan | 10.4997 | 6.4532 | 2097.1521 | {'data': (1024, 1024)} | +| arctan | 0.1269 | 0.0683 | 40.0 | {'data': (10000, 1)} | +| arctan | 10.1779 | 6.1741 | 2000.0 | {'data': (10000, 100)} | +| broadcast_mod | 0.0131 | 0.0127 | 0.024 | {'lhs': [(1024, 1024), (10000, 10), (10000, 1)], 'rhs': [(1024, 1024), (10000, 10), (10000, 1)]} | +| size_array | 0.0056 | --- | 0.008 | {'data': (1024, 1024)} | +| size_array | 0.005 | --- | 0.008 | {'data': (10000, 1)} | +| size_array | 0.0081 | --- | 0.004 | {'data': (10000, 100)} | +| make_loss | 0.4874 | --- | 4194.3042 | {'data': (1024, 1024)} | +| make_loss | 0.013 | --- | 40.0 | {'data': (10000, 1)} | +| make_loss | 0.3483 | --- | 4000.0 | {'data': (10000, 100)} | +| broadcast_greater | 0.0082 | --- | 0.024 | {'lhs': [(1024, 1024), (10000, 10), (10000, 1)], 'rhs': [(1024, 1024), (10000, 10), (10000, 1)]} | +| gammaln | 49.6217 | 105.7931 | 2097.1521 | {'data': (1024, 1024)} | +| gammaln | 0.4789 | 0.9577 | 40.0 | {'data': (10000, 1)} | +| gammaln | 48.474 | 102.211 | 4000.0 | {'data': (10000, 100)} | +| broadcast_lesser | 0.0084 | --- | 0.024 | {'lhs': [(1024, 1024), (10000, 10), (10000, 1)], 'rhs': [(1024, 1024), (10000, 10), (10000, 1)]} | +| max_axis | 30.1487 | --- | 0.004 | {'data': (1024, 1024), 'axis': ()} | +| max_axis | 0.3101 | --- | 0.004 | {'data': (10000, 1), 'axis': 0} | +| max_axis | 29.4315 | --- | 0.004 | {'data': (10000, 100), 'axis': (0, 1)} | +| degrees | 3.659 | 4.2964 | 2097.1521 | {'data': (1024, 1024)} | +| degrees | 0.0595 | 0.0538 | 20.0 | {'data': (10000, 1)} | +| degrees | 3.8676 | 4.1255 | 4000.0 | {'data': (10000, 100)} | +| sinh | 8.9259 | 10.3014 | 2097.1521 | {'data': (1024, 1024)} | +| sinh | 0.0989 | 0.1048 | 40.0 | {'data': (10000, 1)} | +| sinh | 8.4579 | 9.7402 | 2000.0 | {'data': (10000, 100)} | +| zeros_like | 2.4764 | --- | 4194.3042 | {'data': (1024, 1024)} | +| zeros_like | 0.0056 | --- | 40.0 | {'data': (10000, 1)} | +| zeros_like | 2.3254 | --- | 4000.0 | {'data': (10000, 100)} | +| arccosh | 6.8035 | 7.7818 | 2097.1521 | {'data': (1024, 1024)} | +| arccosh | 0.0764 | 0.0847 | 40.0 | {'data': (10000, 1)} | +| arccosh | 6.444 | 7.5842 | 2000.0 | {'data': (10000, 100)} | +| prod | 28.2885 | 55.9765 | 0.002 | {'data': (1024, 1024), 'axis': ()} | +| prod | 0.2996 | 0.5213 | 0.004 | {'data': (10000, 1), 'axis': 0} | +| prod | 26.9891 | 54.6354 | 0.004 | {'data': (10000, 100), 'axis': (0, 1)} | +| random_gamma | 247.5786 | --- | 2097.1521 | {'shape': (1024, 1024)} | +| random_gamma | 2.3986 | --- | 40.0 | {'shape': (10000, 1)} | +| random_gamma | 237.5963 | --- | 2000.0 | {'shape': (10000, 100)} | +| broadcast_minus | --- | --- | 0.012 | {'lhs': [(1024, 1024), (10000, 10), (10000, 1)], 'rhs': [(1024, 1024), (10000, 10), (10000, 1)]} | +| Flatten | 0.3339 | --- | 4194.3042 | {'data': 
(1024, 1024)} | +| Flatten | 0.0152 | --- | 40.0 | {'data': (10000, 1)} | +| Flatten | 0.3546 | --- | 4000.0 | {'data': (10000, 100)} | +| expm1 | 9.8241 | 11.7609 | 4194.3042 | {'data': (1024, 1024)} | +| expm1 | 0.1844 | 0.1675 | 40.0 | {'data': (10000, 1)} | +| expm1 | 9.0366 | 10.4387 | 4000.0 | {'data': (10000, 100)} | +| elemwise_div | 0.0064 | --- | 0.024 | {'lhs': [(1024, 1024), (10000, 10), (10000, 1)], 'rhs': [(1024, 1024), (10000, 10), (10000, 1)]} | +| LeakyReLU | 10.3625 | 12.5441 | 4194.3042 | {'data': (1024, 1024), 'act_type': 'leaky', 'slope': 0.1} | +| LeakyReLU | 0.1076 | 0.1277 | 40.0 | {'data': (10000, 1), 'act_type': 'leaky', 'slope': 0.1} | +| LeakyReLU | 9.5913 | 11.7957 | 2000.0 | {'data': (10000, 100), 'act_type': 'leaky', 'slope': 0.1} | +| LeakyReLU | 12.337 | 12.6383 | 2097.1521 | {'data': (1024, 1024), 'act_type': 'elu', 'slope': 0.1} | +| LeakyReLU | 0.1305 | 0.1217 | 40.0 | {'data': (10000, 1), 'act_type': 'elu', 'slope': 0.1} | +| LeakyReLU | 11.652 | 11.8465 | 4000.0 | {'data': (10000, 100), 'act_type': 'elu', 'slope': 0.1} | +| LeakyReLU | 12.4973 | 11.4957 | 2097.1521 | {'data': (1024, 1024), 'act_type': 'selu'} | +| LeakyReLU | 0.1295 | 0.1176 | 40.0 | {'data': (10000, 1), 'act_type': 'selu'} | +| LeakyReLU | 12.2224 | 11.548 | 4000.0 | {'data': (10000, 100), 'act_type': 'selu'} | +| LeakyReLU | 16.9543 | 306.6579 | 2097.1521 | {'data': (1024, 1024), 'act_type': 'prelu', 'gamma': (1, 1024)} | +| LeakyReLU | 0.2859 | 1.9528 | 20.0 | {'data': (10000, 1), 'act_type': 'prelu', 'gamma': (1, 1)} | +| LeakyReLU | 16.0125 | 231.8273 | 2000.0 | {'data': (10000, 100), 'act_type': 'prelu', 'gamma': (1, 100)} | +| rint | 14.9397 | --- | 4194.3042 | {'data': (1024, 1024)} | +| rint | 0.1535 | --- | 40.0 | {'data': (10000, 1)} | +| rint | 14.5915 | --- | 4000.0 | {'data': (10000, 100)} | +| identity | --- | --- | 4194.3042 | {'data': (1024, 1024)} | +| identity | --- | --- | 40.0 | {'data': (10000, 1)} | +| identity | --- | --- | 4000.0 | {'data': (10000, 100)} | +| softsign | 3.9985 | 7.05 | 2097.1521 | {'data': (1024, 1024)} | +| softsign | 0.0486 | 0.0737 | 40.0 | {'data': (10000, 1)} | +| softsign | 3.7662 | 6.7975 | 2000.0 | {'data': (10000, 100)} | +| broadcast_div | 0.0083 | 0.0075 | 0.024 | {'lhs': [(1024, 1024), (10000, 10), (10000, 1)], 'rhs': [(1024, 1024), (10000, 10), (10000, 1)]} | +| square | 4.2037 | 4.9639 | 2097.1521 | {'data': (1024, 1024)} | +| square | 0.0467 | 0.0558 | 40.0 | {'data': (10000, 1)} | +| square | 3.9986 | 4.6533 | 2000.0 | {'data': (10000, 100)} | +| elemwise_sub | 0.0058 | --- | 0.024 | {'lhs': [(1024, 1024), (10000, 10), (10000, 1)], 'rhs': [(1024, 1024), (10000, 10), (10000, 1)]} | +| dot | 14.562 | 29.1605 | 4194.3042 | {'lhs': (1024, 1024), 'rhs': (1024, 1024)} | +| dot | 0.745 | 1.5842 | 2000.0 | {'lhs': (1000, 10), 'rhs': (1000, 10), 'transpose_b': True} | +| dot | 0.0579 | 0.1673 | 0.2 | {'lhs': (1000, 1), 'rhs': (100, 1000), 'transpose_a': True, 'transpose_b': True} | +| broadcast_logical_and | 0.0071 | --- | 0.024 | {'lhs': [(1024, 1024), (10000, 10), (10000, 1)], 'rhs': [(1024, 1024), (10000, 10), (10000, 1)]} | +| broadcast_add | 0.0081 | 0.0066 | 0.012 | {'lhs': [(1024, 1024), (10000, 10), (10000, 1)], 'rhs': [(1024, 1024), (10000, 10), (10000, 1)]} | +| random_exponential | 63.2732 | --- | 4194.3042 | {'shape': (1024, 1024)} | +| random_exponential | 0.6453 | --- | 40.0 | {'shape': (10000, 1)} | +| random_exponential | 59.2788 | --- | 2000.0 | {'shape': (10000, 100)} | +| Dropout | 249.4661 | 23.5141 | 37748.7344 | 
{'data': (32, 3, 256, 256), 'p': 0.5, 'mode': 'always'} | +| Dropout | 3.9634 | 0.3516 | 600.0 | {'data': (10000, 10), 'p': 0.5, 'mode': 'always'} | +| exp | 8.9413 | --- | 4194.3042 | {'data': (1024, 1024)} | +| exp | 0.0971 | --- | 40.0 | {'data': (10000, 1)} | +| exp | 7.9211 | --- | 4000.0 | {'data': (10000, 100)} | +| random_generalized_negative_binomial | 362.7789 | --- | 2097.1521 | {'shape': (1024, 1024)} | +| random_generalized_negative_binomial | 3.4276 | --- | 40.0 | {'shape': (10000, 1)} | +| random_generalized_negative_binomial | 344.3516 | --- | 4000.0 | {'shape': (10000, 100)} | +| min | 30.8723 | 55.9413 | 0.002 | {'data': (1024, 1024), 'axis': ()} | +| min | 0.3168 | 0.5206 | 0.002 | {'data': (10000, 1), 'axis': 0} | +| min | 29.9547 | 53.8245 | 0.004 | {'data': (10000, 100), 'axis': (0, 1)} | +| erfinv | 79.987 | 99.2274 | 2097.1521 | {'data': (1024, 1024)} | +| erfinv | 0.7567 | 0.9105 | 40.0 | {'data': (10000, 1)} | +| erfinv | 76.0479 | 95.5001 | 2000.0 | {'data': (10000, 100)} | +| broadcast_plus | --- | --- | 0.012 | {'lhs': [(1024, 1024), (10000, 10), (10000, 1)], 'rhs': [(1024, 1024), (10000, 10), (10000, 1)]} | +| arcsin | 16.3157 | 7.6156 | 2097.1521 | {'data': (1024, 1024)} | +| arcsin | 0.1611 | 0.0758 | 40.0 | {'data': (10000, 1)} | +| arcsin | 16.0225 | 7.5081 | 2000.0 | {'data': (10000, 100)} | +| sample_generalized_negative_binomial | 629.1785 | --- | 8388.6084 | {'mu': [2.0, 2.5], 'shape': (1024, 1024), 'alpha': [0.0, 2.5]} | +| sample_generalized_negative_binomial | 6.8681 | --- | 80.0 | {'mu': [2.0, 2.5], 'shape': (10000, 1), 'alpha': [0.0, 2.5]} | +| sample_generalized_negative_binomial | 604.3484 | --- | 8000.0 | {'mu': [2.0, 2.5], 'shape': (10000, 100), 'alpha': [0.0, 2.5]} | +| relu | 11.0979 | 8.3262 | 2097.1521 | {'data': (1024, 1024)} | +| relu | 0.1163 | 0.0853 | 40.0 | {'data': (10000, 1)} | +| relu | 10.6863 | 8.0702 | 4000.0 | {'data': (10000, 100)} | +| cbrt | 11.3121 | 6.5254 | 2097.1521 | {'data': (1024, 1024)} | +| cbrt | 0.1238 | 0.0687 | 40.0 | {'data': (10000, 1)} | +| cbrt | 10.4631 | 6.0997 | 2000.0 | {'data': (10000, 100)} | +| sample_uniform | 89.1332 | --- | 8388.6084 | {'low': [0.0, 2.5], 'shape': (1024, 1024), 'high': [1.0, 3.7]} | +| sample_uniform | 0.8895 | --- | 80.0 | {'low': [0.0, 2.5], 'shape': (10000, 1), 'high': [1.0, 3.7]} | +| sample_uniform | 84.4477 | --- | 8000.0 | {'low': [0.0, 2.5], 'shape': (10000, 100), 'high': [1.0, 3.7]} | +| Convolution | 13.4072 | 17.0238 | 56610.418 | {'data': (32, 3, 256), 'weight': (64, 3, 3), 'bias': (64,), 'kernel': (3,), 'stride': (1,), 'dilate': (1,), 'pad': (0,), 'num_filter': 64, 'layout': 'NCW'} | +| sample_poisson | 512.1068 | --- | 8388.6084 | {'lam': [1.0, 8.5], 'shape': (1024, 1024)} | +| sample_poisson | 4.6203 | --- | 80.0 | {'lam': [1.0, 8.5], 'shape': (10000, 1)} | +| sample_poisson | 474.1238 | --- | 8000.0 | {'lam': [1.0, 8.5], 'shape': (10000, 100)} | +| log_softmax | 21.4413 | 15.7456 | 2097.1521 | {'data': (1024, 1024), 'axis': -1, 'temperature': 0.5} | +| log_softmax | 0.4613 | 0.2958 | 20.0 | {'data': (10000, 1), 'axis': -1, 'temperature': 0.5} | +| log_softmax | 21.9745 | 15.2407 | 4000.0 | {'data': (10000, 100), 'axis': -1, 'temperature': 0.5} | +| gamma | 35.1027 | 124.2015 | 2097.1521 | {'data': (1024, 1024)} | +| gamma | 0.3611 | 1.1177 | 20.0 | {'data': (10000, 1)} | +| gamma | 33.636 | 117.6889 | 2000.0 | {'data': (10000, 100)} | +| reciprocal | 3.4646 | 6.1106 | 2097.1521 | {'data': (1024, 1024)} | +| reciprocal | 0.0413 | 0.0635 | 40.0 | {'data': (10000, 1)} 
|
+| reciprocal | 3.2553 | 5.8762 | 2000.0 | {'data': (10000, 100)} |
+| sigmoid | 9.8017 | 5.9639 | 2097.1521 | {'data': (1024, 1024)} |
+| sigmoid | 0.1095 | 0.0651 | 40.0 | {'data': (10000, 1)} |
+| sigmoid | 9.0443 | 5.7901 | 2000.0 | {'data': (10000, 100)} |
\ No newline at end of file
diff --git a/benchmark/opperf/rules/__init__.py b/benchmark/opperf/rules/__init__.py
new file mode 100644
index 000000000000..13a83393a912
--- /dev/null
+++ b/benchmark/opperf/rules/__init__.py
@@ -0,0 +1,16 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
diff --git a/benchmark/opperf/rules/default_params.py b/benchmark/opperf/rules/default_params.py
new file mode 100644
index 000000000000..59b2aff53570
--- /dev/null
+++ b/benchmark/opperf/rules/default_params.py
@@ -0,0 +1,82 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""Default Input Tensor shapes to use for benchmarking"""
+
+# For Unary operators like abs, arccos, arcsin etc..
+DEFAULT_DATA = [(1024, 1024), (10000, 1), (10000, 100)]
+
+# For Binary broadcast operators like - broadcast_add/sub/mod/logical_and etc..
+DEFAULT_LHS = [[(1024, 1024), (10000, 10), (10000, 1)]]
+DEFAULT_RHS = [[(1024, 1024), (10000, 10), (10000, 1)]]
+
+# For operators like - random_uniform, random_normal etc..
+DEFAULT_SHAPE = [(1024, 1024), (10000, 1), (10000, 100)]
+DEFAULT_LOW = [0]
+DEFAULT_HIGH = [5]
+DEFAULT_K = [1]
+DEFAULT_P = [1]
+
+# For operators like - sample_uniform, sample_normal etc..
+# NOTE: There are many overlapping operators in random_* and sample_*,
+# Ex: random_uniform, sample_uniform. Parameter names are the same, but, for
+# random_* operators they are float/int and for sample_* operators they are NDArray.
+# Hence, below we append ND to mark the difference.
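+#
+# For example (illustrative calls, values taken from the defaults in this file):
+#   mx.nd.random_uniform(low=0, high=5, shape=(1024, 1024))      # scalar low/high
+#   mx.nd.sample_uniform(low=mx.nd.array([0.0, 2.5]),
+#                        high=mx.nd.array([1.0, 3.7]))           # NDArray low/high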
+DEFAULT_LOW_ND = [[0.0, 2.5]]
+DEFAULT_HIGH_ND = [[1.0, 3.7]]
+DEFAULT_MU_ND = [[2.0, 2.5]]
+DEFAULT_SIGMA = [[1.0, 3.7]]
+DEFAULT_ALPHA_ND = [[0.0, 2.5]]
+DEFAULT_BETA_ND = [[1.0, 0.7]]
+DEFAULT_LAM = [[1.0, 8.5]]
+DEFAULT_K_ND = [[20, 49]]
+DEFAULT_P_ND = [[0.4, 0.77]]
+
+# For reduction operators
+# NOTE: Data used is DEFAULT_DATA
+DEFAULT_AXIS = [(), 0, (0, 1)]
+
+# Default Inputs. MXNet Op Param Name to Default Input mapping
+DEFAULTS_INPUTS = {"data": DEFAULT_DATA,
+                   "lhs": DEFAULT_LHS,
+                   "rhs": DEFAULT_RHS,
+                   "shape": DEFAULT_SHAPE,
+                   "low": DEFAULT_LOW,
+                   "high": DEFAULT_HIGH,
+                   "low_nd": DEFAULT_LOW_ND,
+                   "high_nd": DEFAULT_HIGH_ND,
+                   "mu_nd": DEFAULT_MU_ND,
+                   "sigma": DEFAULT_SIGMA,
+                   "alpha_nd": DEFAULT_ALPHA_ND,
+                   "beta_nd": DEFAULT_BETA_ND,
+                   "lam_nd": DEFAULT_LAM,
+                   "k": DEFAULT_K,
+                   "p": DEFAULT_P,
+                   "k_nd": DEFAULT_K_ND,
+                   "p_nd": DEFAULT_P_ND,
+                   "axis": DEFAULT_AXIS}
+
+# These are names of MXNet operator parameters that are of type NDArray.
+# We maintain this list to automatically recognize such parameters, so that
+# user inputs like a shape tuple, a Numpy array, or a list can be translated
+# to an MXNet NDArray. This is just a convenience for benchmark utility users,
+# who can just give the shape of a tensor and we automatically create the Tensor.
+PARAMS_OF_TYPE_NDARRAY = ["lhs", "rhs", "data", "base", "exp",
+                          "mu", "sigma", "lam", "alpha", "beta", "gamma", "k", "p",
+                          "low", "high", "weight", "bias", "moving_mean", "moving_var"]
diff --git a/benchmark/opperf/utils/__init__.py b/benchmark/opperf/utils/__init__.py
new file mode 100644
index 000000000000..13a83393a912
--- /dev/null
+++ b/benchmark/opperf/utils/__init__.py
@@ -0,0 +1,16 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
diff --git a/benchmark/opperf/utils/benchmark_utils.py b/benchmark/opperf/utils/benchmark_utils.py
new file mode 100644
index 000000000000..dc4890b3df0f
--- /dev/null
+++ b/benchmark/opperf/utils/benchmark_utils.py
@@ -0,0 +1,132 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
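+
+"""Utilities to prepare operator inputs and run operator benchmarks.
+
+Illustrative usage (shapes and values are arbitrary; see run_performance_test
+below for details):
+
+    import mxnet as mx
+    from benchmark.opperf.utils.benchmark_utils import run_performance_test
+
+    run_performance_test(mx.nd.add,
+                         inputs=[{"lhs": (1024, 1024), "rhs": (1024, 1024)}],
+                         run_backward=True, warmup=10, runs=50)
+"""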
+ +import logging + +import mxnet as mx +from mxnet import nd + +from .ndarray_utils import get_mx_ndarray, nd_forward_and_profile, nd_forward_backward_and_profile +from .common_utils import merge_map_list +from .op_registry_utils import prepare_op_inputs +from benchmark.opperf.rules.default_params import PARAMS_OF_TYPE_NDARRAY + + +def _prepare_op_inputs(inputs, run_backward, dtype, ctx): + kwargs_list = [] + + for inp in inputs: + kwargs = {} + for key, value in inp.items(): + if key in PARAMS_OF_TYPE_NDARRAY: + kwargs[key] = get_mx_ndarray(ctx=ctx, in_tensor=value, + dtype=dtype, + initializer=nd.normal, + attach_grad=run_backward) + else: + kwargs[key] = value + kwargs_list.append(kwargs) + + return kwargs_list + + +def _run_nd_operator_performance_test(op, inputs, run_backward, warmup, runs, kwargs_list): + if run_backward: + benchmark_helper_func = nd_forward_backward_and_profile + else: + benchmark_helper_func = nd_forward_and_profile + + # Warm up, ignore the profiler output + _, _ = benchmark_helper_func(op, warmup, **kwargs_list[0]) + + # Run Benchmarks + op_benchmark_result = {op.__name__: []} + logging.info(f"Begin Benchmark - {op.__name__}") + for idx, kwargs in enumerate(kwargs_list): + _, profiler_output = benchmark_helper_func(op, runs, **kwargs) + + # Add inputs used for profiling this operator into result + profiler_output["inputs"] = inputs[idx] + op_benchmark_result[op.__name__].append(profiler_output) + logging.info(f"Complete Benchmark - {op.__name__}") + return op_benchmark_result + + +def run_performance_test(ops, inputs, run_backward=True, + dtype='float32', ctx=mx.cpu(), + warmup=10, runs=50): + """Run operator benchmark for given operator or list of operators, ops, with the given inputs. + + Returns benchmark results as a list of dictionary where each dictionary represents benchmarks result per operator. + key -> name of the operator and value -> map of results (forward time, backward time, time spent in memory + operations. + + Parameters + ---------- + ops: [Str] + One or list of operators to benchmark. Should be an NDArray operator. + inputs: map + Inputs for operator. Key should be name of parameter for operator. + Example: inputs = {"lhs": (1024, 1024), "rhs": (1024, 1024)} for mx.nd.add + run_backward: Boolean, Default is True + Should we have backward operator benchmarks. + dtype: Str, default 'float32' + Precision to use for input tensors. Defaults to float32. Example: 'float32', 'int64' + ctx: mx.ctx, default mx.cpu() + Context to use for benchmarks. Default to mx.cpu() + warmup: int, default 10 + Number of warmup runs + runs: int, default 50 + Number of runs for capturing benchmark results + + Returns + ------- + List of dictionary of benchmark results. key -> name of the operator, Value is benchmark results. + + """ + kwargs_list = _prepare_op_inputs(inputs, run_backward, dtype, ctx) + + if not isinstance(ops, list): + ops = [ops] + + op_benchmark_result = [] + for op in ops: + if hasattr(mx.nd, op.__name__): + benchmark_result = _run_nd_operator_performance_test(op, inputs, run_backward, warmup, runs, kwargs_list) + else: + raise ValueError("Unknown NDArray operator provided to benchmark. 
- ", op.__name__) + op_benchmark_result.append(benchmark_result) + return op_benchmark_result + + +def run_op_benchmarks(ops, dtype, ctx, warmup, runs): + # For each operator, run benchmarks + mx_op_benchmark_results = [] + for _, op_params in ops.items(): + # Prepare inputs for the operator + inputs = prepare_op_inputs(op_params) + # Run benchmarks + cur_op_res = run_performance_test(op_params["nd_op_handle"], + run_backward=op_params["has_backward"], + dtype=dtype, ctx=ctx, + inputs=inputs, + warmup=warmup, runs=runs) + mx_op_benchmark_results += cur_op_res + + # Prepare combined results for all operators + mx_op_benchmark_results = merge_map_list(mx_op_benchmark_results) + return mx_op_benchmark_results diff --git a/benchmark/opperf/utils/common_utils.py b/benchmark/opperf/utils/common_utils.py new file mode 100644 index 000000000000..9fe2e19b13b3 --- /dev/null +++ b/benchmark/opperf/utils/common_utils.py @@ -0,0 +1,120 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import os +import json +from operator import itemgetter + +from collections import ChainMap + +import logging +logging.basicConfig(level=logging.INFO) + + +def merge_map_list(map_list): + """Merge all the Map in map_list into one final Map. + + Useful when you have a list of benchmark result maps and you want to + prepare one final map combining all results. + + Parameters + ---------- + map_list: List[maps] + List of maps to be merged. + + Returns + ------- + map where all individual maps in the into map_list are merged + + """ + return dict(ChainMap(*map_list)) + + +def save_to_file(inp_dict, out_filepath, out_format='json'): + """Saves the given input dictionary to the given output file. + + By default, saves the input dictionary as JSON file. Other supported formats include: + 1. md + + Parameters + ---------- + inp_dict: map + Input dictionary to be saved + out_filepath: str + Output file path + out_format: str, default 'json' + Format of the output file. Supported options - 'json', 'md'. Default - json. + + """ + if out_format == 'json': + # Save as JSON + with open(out_filepath, "w") as result_file: + json.dump(inp_dict, result_file, indent=4, sort_keys=True) + elif out_format == 'md': + # Save as md + with open(out_filepath, "w") as result_file: + result_file.write(_prepare_markdown(inp_dict)) + else: + raise ValueError("Invalid output file format provided - '{}'. Supported - json, md".format(format)) + + +def get_json(inp_dict): + """Converts a given dictionary to prettified JSON string. + + Parameters + ---------- + inp_dict: map + Input dictionary to be converted to JSON. 
+ + Returns + ------- + Prettified JSON string + + """ + return json.dumps(inp_dict, indent=4) + + +def _prepare_op_benchmark_result(op, op_bench_result): + operator_name = op + avg_forward_time = "---" + avg_backward_time = "---" + max_mem_usage = "---" + inputs = "---" + for key, value in op_bench_result.items(): + if "avg_time_forward" in key: + avg_forward_time = value + elif "avg_time_backward" in key: + avg_backward_time = value + elif "max_storage_mem_alloc_" in key: + max_mem_usage = value + elif "inputs" in key: + inputs = value + return "| {} | {} | {} | {} | {} |".format(operator_name, avg_forward_time, avg_backward_time, + max_mem_usage, inputs) + + +def _prepare_markdown(results): + results_markdown = [ + "| Operator | Avg Forward Time (ms) | Avg. Backward Time (ms) | Max Mem Usage (Storage) (Bytes)" + " | Inputs |", + "| :---: | :---: | :---: | :---:| :--- |"] + + for op, op_bench_results in sorted(results.items(), key=itemgetter(0)): + for op_bench_result in op_bench_results: + results_markdown.append(_prepare_op_benchmark_result(op, op_bench_result)) + + return os.linesep.join(results_markdown) diff --git a/benchmark/opperf/utils/ndarray_utils.py b/benchmark/opperf/utils/ndarray_utils.py new file mode 100644 index 000000000000..7ed2fa107066 --- /dev/null +++ b/benchmark/opperf/utils/ndarray_utils.py @@ -0,0 +1,127 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import numpy as np +import mxnet as mx +import mxnet.ndarray as nd + +from .profiler_utils import profile + + +@profile +def nd_forward_backward_and_profile(op, runs, *args, **kwargs): + """Helper function to run a given NDArray operator (op) for 'runs' number of times with + given args and kwargs. Executes both forward and backward pass. + + NOTE: This is a sync call and waits for all the operations execution to complete. + + Parameters + ---------- + op: Str + NDArray operator (Function reference) to execute. Example: mx.nd.add + runs: int + Number of times to execute the operation + args: + Arguments for the NDArray operator (op) being executed. + kwargs: + Key value arguments for the NDArray operator (op) being executed. + + Returns + ------- + any results from NDArray operation execution + + """ + for _ in range(runs): + with mx.autograd.record(): + res = op(*args, **kwargs) + res.backward() + nd.waitall() + return res + + +@profile +def nd_forward_and_profile(op, runs, *args, **kwargs): + """Helper function to run a given NDArray operator (op) for 'runs' number of times with + given args and kwargs. Executes ONLY forward pass. + + NOTE: This is a sync call and waits for all the operations execution to complete. + + Parameters + ---------- + op: Str + NDArray operator (Function reference) to execute. 
Example: mx.nd.add
+    runs: int
+        Number of times to execute the operation
+    args:
+        Arguments for the NDArray operator (op) being executed.
+    kwargs:
+        Key value arguments for the NDArray operator (op) being executed.
+
+    Returns
+    -------
+    any results from NDArray operation execution
+    """
+    for _ in range(runs):
+        res = op(*args, **kwargs)
+        nd.waitall()
+    return res
+
+
+def get_mx_ndarray(ctx, in_tensor, dtype, initializer, attach_grad=True):
+    """Helper function to prepare an MXNet NDArray tensor in given Context (ctx) of type (dtype) with given
+    initializer. You can get a new Tensor by providing only "Shape" or "Numpy NDArray" or another MXNet NDArray as
+    "in_tensor".
+
+    NOTE: This is a sync call and waits for the Tensor to be created.
+
+    Parameters
+    ----------
+    ctx: mx.ctx, default mx.cpu()
+        Context of the new MXNet NDArray Tensor.
+    in_tensor: Numpy NDArray or MXNet NDArray or Tuple of shape
+        Can be a tuple of shape or Numpy NDArray or MXNet NDArray.
+    dtype: str
+        Precision or Dtype of the expected Tensor. Ex: "float32", "int64"
+    initializer:
+        Function reference to the initializer to use. Ex: mx.nd.random.normal, mx.nd.zeros
+    attach_grad: Boolean, default True
+        To attach a gradient for the Tensor. Default is True.
+
+    Returns
+    -------
+    MXNet NDArray Tensor.
+    """
+    if isinstance(in_tensor, int) or isinstance(in_tensor, float):
+        return in_tensor
+
+    if isinstance(in_tensor, tuple):
+        tensor = initializer(ctx=ctx, shape=in_tensor, dtype=dtype)
+    elif isinstance(in_tensor, list):
+        tensor = nd.array(in_tensor, ctx=ctx, dtype=dtype)
+    elif isinstance(in_tensor, np.ndarray):
+        tensor = nd.array(in_tensor, ctx=ctx, dtype=dtype)
+    elif isinstance(in_tensor, mx.nd.NDArray):
+        tensor = in_tensor.as_in_context(ctx).astype(dtype=dtype)
+    else:
+        raise ValueError("Invalid input type for creating input tensor. Input can be tuple() of shape or Numpy Array or"
+                         " MXNet NDArray. Given - ", in_tensor)
+
+    if attach_grad:
+        tensor.attach_grad()
+
+    tensor.wait_to_read()
+    return tensor
diff --git a/benchmark/opperf/utils/op_registry_utils.py b/benchmark/opperf/utils/op_registry_utils.py
new file mode 100644
index 000000000000..f5b47cb7a9e4
--- /dev/null
+++ b/benchmark/opperf/utils/op_registry_utils.py
@@ -0,0 +1,331 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""Utilities to interact with MXNet operator registry."""
+import ctypes
+import sys
+from mxnet.base import _LIB, check_call, py_str, OpHandle, c_str, mx_uint
+
+from benchmark.opperf.rules.default_params import DEFAULTS_INPUTS
+
+# We will use all operators inside NDArray Module
+mx_nd_module = sys.modules["mxnet.ndarray.op"]
+
+# Operators whose parameters have special criteria that cannot be cleanly automated.
+# Example: sample_multinomial operator has a parameter 'data'.
It expects values to sum up to 1.
+unique_ops = ("sample_multinomial",)
+
+
+def _select_ops(operator_names, filters=("_contrib", "_"), merge_op_forward_backward=True):
+    """From a given list of operators, filter out all operator names starting with the given filters and prepare
+    a dictionary of operators with attributes - 'has_backward' and 'nd_op_handle = mxnet.ndarray.op'
+
+    By default, merge forward and backward operators for a given op into one operator and set the attribute
+    'has_backward' for the operator.
+
+    By default, filter out all Contrib operators that start with '_contrib' and internal operators that
+    start with '_'.
+
+    Parameters
+    ----------
+    operator_names: List[str]
+        List of operator names.
+    filters: Tuple(str)
+        Tuple of filters to apply on operator names.
+    merge_op_forward_backward: Boolean, Default - True
+        Merge forward and backward operators for a given op into one op.
+
+    Returns
+    -------
+    {"operator_name": {"has_backward", "nd_op_handle"}}
+    """
+    mx_operators = {}
+    operators_with_backward = []
+
+    if merge_op_forward_backward:
+        filters += ("_backward",)
+
+    for cur_op_name in operator_names:
+        if not cur_op_name.startswith(filters):
+            mx_operators[cur_op_name] = {"has_backward": False,
+                                         "nd_op_handle": getattr(mx_nd_module, cur_op_name)}
+
+        if cur_op_name.startswith("_backward_"):
+            operators_with_backward.append(cur_op_name)
+
+    if merge_op_forward_backward:
+        # Identify all operators that can run backward.
+        for op_with_backward in operators_with_backward:
+            op_name = op_with_backward.split("_backward_")[1]
+            if op_name in mx_operators:
+                mx_operators[op_name]["has_backward"] = True
+
+    return mx_operators
+
+
+def _get_all_registered_ops():
+    """Get all registered MXNet operator names.
+
+    Returns
+    -------
+    ["operator_name"]
+    """
+    plist = ctypes.POINTER(ctypes.c_char_p)()
+    size = ctypes.c_uint()
+
+    check_call(_LIB.MXListAllOpNames(ctypes.byref(size),
+                                     ctypes.byref(plist)))
+
+    mx_registered_operator_names = [py_str(plist[i]) for i in range(size.value)]
+    return mx_registered_operator_names
+
+
+def _get_op_handles(op_name):
+    """Get handle for an operator with given name - op_name.
+
+    Parameters
+    ----------
+    op_name: str
+        Name of operator to get handle for.
+    """
+    op_handle = OpHandle()
+    check_call(_LIB.NNGetOpHandle(c_str(op_name), ctypes.byref(op_handle)))
+    return op_handle
+
+
+def _get_op_arguments(op_handle):
+    """Given operator name and handle, fetch operator arguments - number of arguments,
+    argument names, argument types.
+
+    Parameters
+    ----------
+    op_handle: OpHandle
+        Handle for the operator
+
+    Returns
+    -------
+    (narg, arg_names, arg_types)
+    """
+    real_name = ctypes.c_char_p()
+    desc = ctypes.c_char_p()
+    num_args = mx_uint()
+    arg_names = ctypes.POINTER(ctypes.c_char_p)()
+    arg_types = ctypes.POINTER(ctypes.c_char_p)()
+    arg_descs = ctypes.POINTER(ctypes.c_char_p)()
+    key_var_num_args = ctypes.c_char_p()
+    ret_type = ctypes.c_char_p()
+
+    check_call(_LIB.MXSymbolGetAtomicSymbolInfo(
+        op_handle, ctypes.byref(real_name), ctypes.byref(desc),
+        ctypes.byref(num_args),
+        ctypes.byref(arg_names),
+        ctypes.byref(arg_types),
+        ctypes.byref(arg_descs),
+        ctypes.byref(key_var_num_args),
+        ctypes.byref(ret_type)))
+
+    narg = int(num_args.value)
+    arg_names = [py_str(arg_names[i]) for i in range(narg)]
+    arg_types = [py_str(arg_types[i]) for i in range(narg)]
+
+    return narg, arg_names, arg_types
+
+
+def _set_op_arguments(mx_operators):
+    """Fetch and set operator arguments - nargs, arg_names, arg_types
+    """
+    for op_name in mx_operators:
+        op_handle = _get_op_handles(op_name)
+        narg, arg_names, arg_types = _get_op_arguments(op_handle)
+        mx_operators[op_name]["params"] = {"narg": narg,
+                                           "arg_names": arg_names,
+                                           "arg_types": arg_types}
+
+
+def _get_all_mxnet_operators():
+    # Step 1 - Get all registered op names and filter it
+    operator_names = _get_all_registered_ops()
+    mx_operators = _select_ops(operator_names)
+
+    # Step 2 - Get all parameters for the operators
+    _set_op_arguments(mx_operators)
+    return mx_operators
+
+
+def prepare_op_inputs(arg_params):
+    inputs = []
+
+    # Prepare op to default input mapping
+    arg_values = {}
+    for arg_name, arg_type in zip(arg_params["params"]["arg_names"],
+                                  arg_params["params"]["arg_types"]):
+        if "NDArray" in arg_type and arg_name + "_nd" in DEFAULTS_INPUTS:
+            arg_values[arg_name] = DEFAULTS_INPUTS[arg_name + "_nd"]
+        elif arg_name in DEFAULTS_INPUTS:
+            arg_values[arg_name] = DEFAULTS_INPUTS[arg_name]
+        elif "float" in arg_type and arg_name + "_float" in DEFAULTS_INPUTS:
+            arg_values[arg_name] = DEFAULTS_INPUTS[arg_name + "_float"]
+
+    # Number of different inputs we want to use to test
+    # the operator
+    num_input_combinations = max([len(value) for value in arg_values.values()])
+
+    # Prepare key/value args for param to input value
+    for idx in range(num_input_combinations):
+        inp = {}
+        for arg_name in arg_params["params"]["arg_names"]:
+            if arg_name in arg_values:
+                if len(arg_values[arg_name]) == num_input_combinations:
+                    inp[arg_name] = arg_values[arg_name][idx]
+                else:
+                    # This is required when we want to use the same param across all
+                    # input combinations. Example: keeping low and high the same for the
+                    # random sampling operator across all the different tensor shapes.
+                    inp[arg_name] = arg_values[arg_name][0]
+
+        inputs.append(inp)
+    return inputs
+
+
+def get_all_unary_operators():
+    """Gets all Unary operators registered with MXNet.
+ + Returns + ------- + {"operator_name": {"has_backward", "nd_op_handle", "params"}} + """ + # Get all mxnet operators + mx_operators = _get_all_mxnet_operators() + + # Filter for unary broadcast operators + unary_broadcast_mx_operators = {} + for op_name, op_params in mx_operators.items(): + if op_params["params"]["narg"] == 1 and \ + "data" in op_params["params"]["arg_names"]: + unary_broadcast_mx_operators[op_name] = mx_operators[op_name] + return unary_broadcast_mx_operators + + +def get_all_broadcast_binary_operators(): + """Gets all binary broadcast operators registered with MXNet. + + Returns + ------- + {"operator_name": {"has_backward", "nd_op_handle", "params"}} + """ + # Get all mxnet operators + mx_operators = _get_all_mxnet_operators() + + # Filter for binary broadcast operators + binary_broadcast_mx_operators = {} + for op_name, op_params in mx_operators.items(): + if op_name.startswith("broadcast_") and op_params["params"]["narg"] == 2 and \ + "lhs" in op_params["params"]["arg_names"] and \ + "rhs" in op_params["params"]["arg_names"]: + binary_broadcast_mx_operators[op_name] = mx_operators[op_name] + return binary_broadcast_mx_operators + + +def get_all_elemen_wise_binary_operators(): + """Gets all binary elemen_wise operators registered with MXNet. + + Returns + ------- + {"operator_name": {"has_backward", "nd_op_handle", "params"}} + """ + # Get all mxnet operators + mx_operators = _get_all_mxnet_operators() + + # Filter for binary elemen_wise operators + binary_elemen_wise_mx_operators = {} + for op_name, op_params in mx_operators.items(): + if op_name.startswith("elemwise_") and op_params["params"]["narg"] == 2 and \ + "lhs" in op_params["params"]["arg_names"] and \ + "rhs" in op_params["params"]["arg_names"]: + binary_elemen_wise_mx_operators[op_name] = mx_operators[op_name] + return binary_elemen_wise_mx_operators + + +def get_all_random_sampling_operators(): + """Gets all Random Sampling operators registered with MXNet. + + Returns + ------- + {"operator_name": {"has_backward", "nd_op_handle", "params"}} + """ + # Get all mxnet operators + mx_operators = _get_all_mxnet_operators() + + # Filter for Random Sampling operators + random_sampling_mx_operators = {} + for op_name, op_params in mx_operators.items(): + if op_name.startswith(("random_", "sample_")) and op_name not in unique_ops: + random_sampling_mx_operators[op_name] = mx_operators[op_name] + return random_sampling_mx_operators + + +def get_all_reduction_operators(): + """Gets all Reduction operators registered with MXNet. + + Returns + ------- + {"operator_name": {"has_backward", "nd_op_handle", "params"}} + """ + # Get all mxnet operators + mx_operators = _get_all_mxnet_operators() + + # Filter for Reduction operators + reduction_mx_operators = {} + for op_name, op_params in mx_operators.items(): + if op_params["params"]["narg"] == 4 and \ + set(["data", "axis", "exclude", "keepdims"]).issubset(set(op_params["params"]["arg_names"])) \ + and op_name not in unique_ops: + reduction_mx_operators[op_name] = mx_operators[op_name] + return reduction_mx_operators + + +def get_operators_with_no_benchmark(operators_with_benchmark): + """Gets all MXNet operators with not benchmark. + + Retrieve all operators registered with MXNet and prepares a list of operators that are not part of given + operators with benchmark list. 
+
+    Parameters
+    ----------
+    operators_with_benchmark: list[str]
+        List of operator names that have benchmarks
+
+    Returns
+    -------
+    list[str]
+        List of operator names that are registered with MXNet but have no benchmarks.
+    """
+    all_mxnet_operators = _get_all_mxnet_operators().keys()
+    return list(set(all_mxnet_operators) - set(operators_with_benchmark))
diff --git a/benchmark/opperf/utils/profiler_utils.py b/benchmark/opperf/utils/profiler_utils.py
new file mode 100644
index 000000000000..a434d3be1e5c
--- /dev/null
+++ b/benchmark/opperf/utils/profiler_utils.py
@@ -0,0 +1,189 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import functools
+
+from .common_utils import merge_map_list
+from mxnet import profiler
+
+"""
+TODO: Below we are parsing the MXNet profiler output string to fetch the
+benchmark results. Note that this is a temporary solution until we add new
+utility APIs, get_summary() and reset(), to the MXNet profiler. All the
+parsing logic below should be removed once these read APIs are available in
+Profiler.
+"""
+
+
+def _get_memory_profile(memory_profile_results):
+    memory_profile = {}
+    for line in memory_profile_results:
+        if line.startswith("Memory:"):
+            device_id = line.split()[1]
+            avg_time_memory_alloc = float(line.split()[-1])
+            memory_profile["max_storage_mem_alloc_" + device_id] = avg_time_memory_alloc
+
+    return memory_profile
+
+
+def _get_operator_profile(operator_name, operator_profile_results):
+    operator_profile = {}
+    for line in operator_profile_results:
+        if operator_name in line or operator_name[:3] + " " in line:
+            operation = line.split()[0]
+            operation_avg_time = float(line.split()[-1])
+            if "_backward" in operation:
+                operator_profile["avg_time" + operation] = operation_avg_time
+            else:
+                operator_profile["avg_time_forward_" + operation] = operation_avg_time
+
+    return operator_profile
+
+
+def parse_profiler_dump(operator_name, profiler_dump):
+    """Parse the MXNet profiler dump output, fetch Memory profile results and
+    Operator compute profiler results.
+
+    Parameters
+    ----------
+    operator_name: string
+        Name of the operator whose profile results are being parsed.
+    profiler_dump: string
+        MXNet profiler output from mx.profiler.dumps() API.
+
+    Returns
+    -------
+    map of Memory and Compute profiler results.
+
+    """
+    if not profiler_dump:
+        raise AssertionError("Invalid MXNet profiler output provided to parse!")
+
+    """
+    MXNet profiler output from mx.profiler.dumps() API looks like below. This function parses
+    this string profiler output to fetch Memory and Compute metrics.
+
+    Profile Statistics.
+        Note that counter items are counter values and not time units.
+    Device Storage
+    =================
+    Name                         Total Count   Time (ms)   Min Time (ms)   Max Time (ms)   Avg Time (ms)
+    ----                         -----------   ---------   -------------   -------------   -------------
+    Memory: cpu/0                        100   2097152.0000   1681915.8750   2097152.0000   207618.0469
+
+    MXNET_C_API
+    =================
+    Name                         Total Count   Time (ms)   Min Time (ms)   Max Time (ms)   Avg Time (ms)
+    ----                         -----------   ---------   -------------   -------------   -------------
+    MXNDArrayFree                         49      1.1220          0.0170          0.0360          0.0229
+    MXAutogradBackwardEx                  50     11.5460          0.1980          0.3360          0.2309
+    MXNet C API Calls                    399      1.9990          1.6010          1.9990          0.1990
+    MXImperativeInvokeEx                  50      4.4810          0.0700          0.1330          0.0896
+    MXNDArrayWaitAll                      50    769.0570         14.0200         24.5030         15.3811
+    MXAutogradSetIsTraining              100      0.0190          0.0000          0.0010          0.0002
+    MXAutogradSetIsRecording             100      0.0400          0.0000          0.0010          0.0004
+    MXNet C API Concurrency              798      0.0000          0.0000          0.0010          0.0005
+
+    operator
+    =================
+    Name                         Total Count   Time (ms)   Min Time (ms)   Max Time (ms)   Avg Time (ms)
+    ----                         -----------   ---------   -------------   -------------   -------------
+    DeleteVariable                       196      1.4490          0.0040          0.0250          0.0074
+    _backward_broadcast_add              100    521.2320          4.8070          8.5970          5.2123
+    SetValueOp                           100    645.8060          5.8820         10.0380          6.4581
+    broadcast_add                        100    394.8910          3.5230          5.8790          3.9489
+    """
+
+    # String Patterns to look out for when parsing
+    memory_profile_result_start = "Device Storage"  # Helps identify start of Memory profile
+    c_api_profile_result_start = "MXNET_C_API"  # Helps identify end of Memory profile
+    operator_profile_result_start = "operator"  # Helps identify start of Operator profile
+
+    memory_profile_results = []
+    operator_profile_results = []
+
+    # Parse lines corresponding to Memory and Computation profiling
+    read_memory_profile = False
+    read_operator_profile = False
+    for line in profiler_dump.splitlines():
+        if line.startswith(memory_profile_result_start):
+            read_memory_profile = True
+        elif line.startswith(operator_profile_result_start):
+            read_operator_profile = True
+        elif line.startswith(c_api_profile_result_start):
+            read_memory_profile = False
+
+        if read_memory_profile:
+            memory_profile_results.append(line)
+        elif read_operator_profile:
+            operator_profile_results.append(line)
+
+    # Prepare results
+    memory_profile = _get_memory_profile(memory_profile_results)
+    operator_profile = _get_operator_profile(operator_name, operator_profile_results)
+
+    return merge_map_list([memory_profile, operator_profile])
+
+
+def profile(func):
+    """Decorator for profiling an MXNet operation.
+    Uses the MXNet profiler to collect metrics on memory usage and execution time
+    of the operation.
+
+    Parameters
+    ----------
+    func:
+        Operation to be executed and timed.
+
+    Returns
+    -------
+    res, profiler output. res is the return value from the operator execution;
+    profiler output is a dictionary with a summary of the operation execution.
+    Example output : { "add": [{"avg_time_mem_alloc_cpu/0": 207618.0469,
+                                "avg_time_forward_broadcast_add": 4.204,
+                                "avg_time_backward_broadcast_add": 5.6288,
+                                "inputs": {
+                                    "lhs": [1024, 1024],
+                                    "rhs": [1024, 1024]
+                                }}]
+                     }
+    """
+
+    @functools.wraps(func)
+    def profile_it(*args, **kwargs):
+        # Profile the operation
+        profiler.set_config(profile_all=True, aggregate_stats=True)
+        profiler.set_state('run')
+        res = func(*args, **kwargs)
+        profiler.set_state('stop')
+
+        # Prepare the results
+        profiler_dump = profiler.dumps(reset=True)
+
+        # args[0] is assumed to be the operator name; if not found, check for the block name.
+ # NOTE: This parameter should be removed when we get away from parsing + # profiler output and start using new profiler APIs - get_summary(), reset() + if len(args) > 0: + operator_name = args[0].__name__ + elif 'block' in kwargs: + operator_name = kwargs['block']._op_name + else: + raise ValueError("Unable to identify operator name to extract profiler output!") + + # Get the MXNet profile output + profiler_output = parse_profiler_dump(operator_name, profiler_dump) + return res, profiler_output + + return profile_it diff --git a/ci/Jenkinsfile_utils.groovy b/ci/Jenkinsfile_utils.groovy index 38cc0b927c43..b8117f39377a 100644 --- a/ci/Jenkinsfile_utils.groovy +++ b/ci/Jenkinsfile_utils.groovy @@ -67,7 +67,7 @@ def pack_lib(name, libs, include_gcov_data = false) { sh returnStatus: true, script: """ set +e echo "Packing ${libs} into ${name}" -echo ${libs} | sed -e 's/,/ /g' | xargs md5sum +for i in \$(echo ${libs} | sed -e 's/,/ /g'); do md5sum \$i; done return 0 """ stash includes: libs, name: name @@ -86,7 +86,7 @@ def unpack_and_init(name, libs, include_gcov_data = false) { sh returnStatus: true, script: """ set +e echo "Unpacked ${libs} from ${name}" -echo ${libs} | sed -e 's/,/ /g' | xargs md5sum +for i in \$(echo ${libs} | sed -e 's/,/ /g'); do md5sum \$i; done return 0 """ if (include_gcov_data) { diff --git a/ci/build_windows.py b/ci/build_windows.py index e8658995b68e..7ec24395e22e 100755 --- a/ci/build_windows.py +++ b/ci/build_windows.py @@ -44,6 +44,8 @@ class BuildFlavour(Enum): WIN_CPU = 'WIN_CPU' WIN_CPU_MKLDNN = 'WIN_CPU_MKLDNN' + WIN_CPU_MKLDNN_MKL = 'WIN_CPU_MKLDNN_MKL' + WIN_CPU_MKL = 'WIN_CPU_MKL' WIN_GPU = 'WIN_GPU' WIN_GPU_MKLDNN = 'WIN_GPU_MKLDNN' @@ -72,8 +74,34 @@ class BuildFlavour(Enum): '-DUSE_LAPACK=1 ' '-DUSE_DIST_KVSTORE=0 ' '-DUSE_MKL_IF_AVAILABLE=1 ' + '-DUSE_MKLDNN=1 ' '-DCMAKE_BUILD_TYPE=Release') + , 'WIN_CPU_MKLDNN_MKL': ('-DUSE_CUDA=0 ' + '-DUSE_CUDNN=0 ' + '-DUSE_NVRTC=0 ' + '-DUSE_OPENCV=1 ' + '-DUSE_OPENMP=1 ' + '-DUSE_PROFILER=1 ' + '-DUSE_BLAS=mkl ' + '-DUSE_LAPACK=1 ' + '-DUSE_DIST_KVSTORE=0 ' + '-DUSE_MKL_IF_AVAILABLE=1 ' + '-DUSE_MKLDNN=1 ' + '-DCMAKE_BUILD_TYPE=Release') + + , 'WIN_CPU_MKL': ('-DUSE_CUDA=0 ' + '-DUSE_CUDNN=0 ' + '-DUSE_NVRTC=0 ' + '-DUSE_OPENCV=1 ' + '-DUSE_OPENMP=1 ' + '-DUSE_PROFILER=1 ' + '-DUSE_BLAS=mkl ' + '-DUSE_LAPACK=1 ' + '-DUSE_DIST_KVSTORE=0 ' + '-DUSE_MKL_IF_AVAILABLE=1 ' + '-DUSE_MKLDNN=0 ' + '-DCMAKE_BUILD_TYPE=Release') , 'WIN_GPU': ('-DUSE_CUDA=1 ' '-DUSE_CUDNN=1 ' '-DUSE_NVRTC=1 ' @@ -218,6 +246,8 @@ def main(): os.environ["OpenCV_DIR"] = "C:\\Program Files\\OpenCV-v3.4.1\\build" if 'CUDA_PATH' not in os.environ: os.environ["CUDA_PATH"] = "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v9.2" + if 'MKL_ROOT' not in os.environ: + os.environ["MKL_ROOT"] = "C:\\Program Files (x86)\\IntelSWTools\\compilers_and_libraries\\windows\\mkl" windows_build(args) elif system == 'Linux' or system == 'Darwin': diff --git a/ci/docker/Dockerfile.build.centos7_gpu b/ci/docker/Dockerfile.build.centos7_gpu index cf76f22a9f0a..1a927c4d5832 100644 --- a/ci/docker/Dockerfile.build.centos7_gpu +++ b/ci/docker/Dockerfile.build.centos7_gpu @@ -29,7 +29,7 @@ RUN /work/centos7_ccache.sh COPY install/centos7_python.sh /work/ RUN /work/centos7_python.sh -ENV CUDNN_VERSION=7.3.1.20 +ENV CUDNN_VERSION=7.6.0.64 COPY install/centos7_cudnn.sh /work/ RUN /work/centos7_cudnn.sh diff --git a/ci/docker/Dockerfile.build.ubuntu_base_cpu b/ci/docker/Dockerfile.build.ubuntu_base_cpu index c3ad2e90fb8d..a75ed0255d82 100644 --- 
a/ci/docker/Dockerfile.build.ubuntu_base_cpu +++ b/ci/docker/Dockerfile.build.ubuntu_base_cpu @@ -25,6 +25,7 @@ WORKDIR /work/deps RUN apt-get update && apt-get -y install sudo +# Always last ARG USER_ID=0 ARG GROUP_ID=0 COPY install/ubuntu_adduser.sh /work/ diff --git a/ci/docker/Dockerfile.build.ubuntu_base_gpu b/ci/docker/Dockerfile.build.ubuntu_base_gpu index 94e49b6fb297..40e1da657203 100644 --- a/ci/docker/Dockerfile.build.ubuntu_base_gpu +++ b/ci/docker/Dockerfile.build.ubuntu_base_gpu @@ -21,12 +21,11 @@ FROM nvidia/cuda:10.0-devel-ubuntu16.04 -ENV CUDNN_VERSION=7.3.1.20 - WORKDIR /work/deps RUN apt-get update && apt-get -y install sudo +ENV CUDNN_VERSION=7.6.0.64 COPY install/ubuntu_cudnn.sh /work/ RUN /work/ubuntu_cudnn.sh diff --git a/ci/docker/Dockerfile.build.ubuntu_build_cuda b/ci/docker/Dockerfile.build.ubuntu_build_cuda index 08c67cd660f8..0607ec1a5e75 100644 --- a/ci/docker/Dockerfile.build.ubuntu_build_cuda +++ b/ci/docker/Dockerfile.build.ubuntu_build_cuda @@ -23,8 +23,6 @@ FROM nvidia/cuda:10.0-devel-ubuntu16.04 -ENV CUDNN_VERSION=7.3.1.20 - WORKDIR /work/deps COPY install/ubuntu_core.sh /work/ @@ -45,6 +43,8 @@ COPY install/ubuntu_clang.sh /work/ RUN /work/ubuntu_clang.sh COPY install/ubuntu_mklml.sh /work/ RUN /work/ubuntu_mklml.sh + +ENV CUDNN_VERSION=7.6.0.64 COPY install/ubuntu_cudnn.sh /work/ RUN /work/ubuntu_cudnn.sh @@ -62,4 +62,3 @@ RUN /work/ubuntu_adduser.sh COPY runtime_functions.sh /work/ WORKDIR /work/mxnet -ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib diff --git a/ci/docker/Dockerfile.build.ubuntu_cpu b/ci/docker/Dockerfile.build.ubuntu_cpu index 2df9f5887f54..35dcf3ed7410 100644 --- a/ci/docker/Dockerfile.build.ubuntu_cpu +++ b/ci/docker/Dockerfile.build.ubuntu_cpu @@ -70,6 +70,7 @@ COPY install/ubuntu_docs.sh /work/ COPY install/docs_requirements /work/ RUN /work/ubuntu_docs.sh +# Always last ARG USER_ID=0 ARG GROUP_ID=0 COPY install/ubuntu_adduser.sh /work/ @@ -78,4 +79,3 @@ RUN /work/ubuntu_adduser.sh COPY runtime_functions.sh /work/ WORKDIR /work/mxnet -ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib diff --git a/ci/docker/Dockerfile.build.ubuntu_gpu_cu100 b/ci/docker/Dockerfile.build.ubuntu_gpu_cu100 index 6ec4a1fe415f..46d27e35022b 100644 --- a/ci/docker/Dockerfile.build.ubuntu_gpu_cu100 +++ b/ci/docker/Dockerfile.build.ubuntu_gpu_cu100 @@ -67,16 +67,16 @@ RUN /work/ubuntu_docs.sh COPY install/ubuntu_tutorials.sh /work/ RUN /work/ubuntu_tutorials.sh +ENV CUDNN_VERSION=7.6.0.64 +COPY install/ubuntu_cudnn.sh /work/ +RUN /work/ubuntu_cudnn.sh + +# Always last ARG USER_ID=0 ARG GROUP_ID=0 COPY install/ubuntu_adduser.sh /work/ RUN /work/ubuntu_adduser.sh -ENV CUDNN_VERSION=7.3.1.20 -COPY install/ubuntu_cudnn.sh /work/ -RUN /work/ubuntu_cudnn.sh - COPY runtime_functions.sh /work/ WORKDIR /work/mxnet -ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib diff --git a/ci/docker/Dockerfile.build.ubuntu_gpu_cu90 b/ci/docker/Dockerfile.build.ubuntu_gpu_cu90 index 2730cc2caee1..19530a212424 100644 --- a/ci/docker/Dockerfile.build.ubuntu_gpu_cu90 +++ b/ci/docker/Dockerfile.build.ubuntu_gpu_cu90 @@ -67,15 +67,16 @@ RUN /work/ubuntu_docs.sh COPY install/ubuntu_tutorials.sh /work/ RUN /work/ubuntu_tutorials.sh +ENV CUDNN_VERSION=7.6.0.64 +COPY install/ubuntu_cudnn.sh /work/ +RUN /work/ubuntu_cudnn.sh + +# Always last ARG USER_ID=0 ARG GROUP_ID=0 COPY install/ubuntu_adduser.sh /work/ RUN /work/ubuntu_adduser.sh -ENV CUDNN_VERSION=7.3.1.20 -COPY install/ubuntu_cudnn.sh /work/ -RUN /work/ubuntu_cudnn.sh - COPY runtime_functions.sh /work/ WORKDIR 
/work/mxnet diff --git a/ci/docker/Dockerfile.build.ubuntu_gpu_cu92 b/ci/docker/Dockerfile.build.ubuntu_gpu_cu92 index 316c81d8a6e1..f239eec4af27 100644 --- a/ci/docker/Dockerfile.build.ubuntu_gpu_cu92 +++ b/ci/docker/Dockerfile.build.ubuntu_gpu_cu92 @@ -67,15 +67,15 @@ RUN /work/ubuntu_docs.sh COPY install/ubuntu_tutorials.sh /work/ RUN /work/ubuntu_tutorials.sh +ENV CUDNN_VERSION=7.6.0.64 +COPY install/ubuntu_cudnn.sh /work/ +RUN /work/ubuntu_cudnn.sh + ARG USER_ID=0 ARG GROUP_ID=0 COPY install/ubuntu_adduser.sh /work/ RUN /work/ubuntu_adduser.sh -ENV CUDNN_VERSION=7.3.1.20 -COPY install/ubuntu_cudnn.sh /work/ -RUN /work/ubuntu_cudnn.sh - COPY runtime_functions.sh /work/ WORKDIR /work/mxnet diff --git a/ci/docker/Dockerfile.build.ubuntu_nightly_gpu b/ci/docker/Dockerfile.build.ubuntu_nightly_gpu index 934aded5101d..a667f7b7a94f 100644 --- a/ci/docker/Dockerfile.build.ubuntu_nightly_gpu +++ b/ci/docker/Dockerfile.build.ubuntu_nightly_gpu @@ -20,8 +20,6 @@ FROM nvidia/cuda:10.0-devel-ubuntu16.04 -ENV CUDNN_VERSION=7.3.1.20 - WORKDIR /work/deps COPY install/ubuntu_core.sh /work/ @@ -72,6 +70,7 @@ RUN /work/ubuntu_tutorials.sh COPY install/ubuntu_nightly_tests.sh /work/ RUN /work/ubuntu_nightly_tests.sh +ENV CUDNN_VERSION=7.6.0.64 COPY install/ubuntu_cudnn.sh /work/ RUN /work/ubuntu_cudnn.sh @@ -83,4 +82,3 @@ RUN /work/ubuntu_adduser.sh COPY runtime_functions.sh /work/ WORKDIR /work/mxnet -ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib diff --git a/ci/docker/install/docs_requirements b/ci/docker/install/docs_requirements index 3cfef1e33901..f78dca2bc655 100644 --- a/ci/docker/install/docs_requirements +++ b/ci/docker/install/docs_requirements @@ -26,8 +26,8 @@ h5py==2.8.0rc1 mock==2.0.0 nose==1.3.7 nose-timer==0.7.3 -numpy<=1.15.2,>=1.8.2 -pylint==1.8.3 +numpy>1.16.0,<2.0.0 +pylint==2.3.1; python_version >= '3.0' pypandoc==1.4 recommonmark==0.4.0 requests<2.19.0,>=2.18.4 diff --git a/ci/docker/install/ubuntu_mklml.sh b/ci/docker/install/ubuntu_mklml.sh index e50b6d273b8c..f97ce10e8e85 100755 --- a/ci/docker/install/ubuntu_mklml.sh +++ b/ci/docker/install/ubuntu_mklml.sh @@ -21,5 +21,5 @@ # the whole docker cache for the image set -ex -wget -q --no-check-certificate -O /tmp/mklml.tgz https://github.com/intel/mkl-dnn/releases/download/v0.18/mklml_lnx_2019.0.3.20190220.tgz +wget -q --no-check-certificate -O /tmp/mklml.tgz https://github.com/intel/mkl-dnn/releases/download/v0.19/mklml_lnx_2019.0.5.20190502.tgz tar -zxf /tmp/mklml.tgz && cp -rf mklml_*/* /usr/local/ && rm -rf mklml_* diff --git a/ci/docker/install/ubuntu_publish.sh b/ci/docker/install/ubuntu_publish.sh index 2d8b019372c7..65982eead389 100755 --- a/ci/docker/install/ubuntu_publish.sh +++ b/ci/docker/install/ubuntu_publish.sh @@ -66,5 +66,5 @@ python2 get-pip.py apt-get remove -y python3-urllib3 -pip2 install nose cpplint==1.3.0 'numpy<=1.15.2,>=1.8.2' nose-timer 'requests<2.19.0,>=2.18.4' h5py==2.8.0rc1 scipy==1.0.1 boto3 -pip3 install nose cpplint==1.3.0 pylint==2.3.1 'numpy<=1.15.2,>=1.8.2' nose-timer 'requests<2.19.0,>=2.18.4' h5py==2.8.0rc1 scipy==1.0.1 boto3 +pip2 install nose cpplint==1.3.0 'numpy>1.16.0,<2.0.0' nose-timer 'requests<2.19.0,>=2.18.4' h5py==2.8.0rc1 scipy==1.0.1 boto3 +pip3 install nose cpplint==1.3.0 pylint==2.3.1 'numpy>1.16.0,<2.0.0' nose-timer 'requests<2.19.0,>=2.18.4' h5py==2.8.0rc1 scipy==1.0.1 boto3 diff --git a/ci/docker/install/ubuntu_python.sh b/ci/docker/install/ubuntu_python.sh index 23158ba4c068..2ca0cceec515 100755 --- a/ci/docker/install/ubuntu_python.sh +++ 
b/ci/docker/install/ubuntu_python.sh
@@ -30,5 +30,5 @@ wget -nv https://bootstrap.pypa.io/get-pip.py
 python3 get-pip.py
 python2 get-pip.py
 
-pip2 install nose cpplint==1.3.0 'numpy<=1.15.2,>=1.8.2' nose-timer 'requests<2.19.0,>=2.18.4' h5py==2.8.0rc1 scipy==1.0.1 boto3
-pip3 install nose cpplint==1.3.0 pylint==2.3.1 'numpy<=1.15.2,>=1.8.2' nose-timer 'requests<2.19.0,>=2.18.4' h5py==2.8.0rc1 scipy==1.0.1 boto3
+pip2 install nose cpplint==1.3.0 'numpy>1.16.0,<2.0.0' nose-timer 'requests<2.19.0,>=2.18.4' h5py==2.8.0rc1 scipy==1.0.1 boto3 Cython==0.29.7
+pip3 install nose cpplint==1.3.0 pylint==2.3.1 'numpy>1.16.0,<2.0.0' nose-timer 'requests<2.19.0,>=2.18.4' h5py==2.8.0rc1 scipy==1.0.1 boto3 Cython==0.29.7
diff --git a/ci/docker/install/ubuntu_tutorials.sh b/ci/docker/install/ubuntu_tutorials.sh
index 4e40426ed85c..d82763e8fd3e 100755
--- a/ci/docker/install/ubuntu_tutorials.sh
+++ b/ci/docker/install/ubuntu_tutorials.sh
@@ -25,5 +25,5 @@ apt-get update || true
 apt-get install graphviz python-opencv
 # scikit-learn past version 0.20 does not support python versions 2 and 3.4
-pip2 install jupyter matplotlib Pillow opencv-python "scikit-learn<0.21.0" graphviz tqdm mxboard scipy
-pip3 install jupyter matplotlib Pillow opencv-python scikit-learn graphviz tqdm mxboard scipy
+pip2 install jupyter matplotlib Pillow opencv-python "scikit-learn<0.21.0" graphviz==0.8.4 tqdm mxboard scipy gluoncv
+pip3 install jupyter matplotlib Pillow opencv-python scikit-learn graphviz==0.8.4 tqdm mxboard scipy gluoncv
diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh
index 091ffdf2551d..1ad67280617d 100755
--- a/ci/docker/runtime_functions.sh
+++ b/ci/docker/runtime_functions.sh
@@ -41,6 +41,26 @@ scala_prepare() {
     export MAVEN_OPTS="-Dorg.slf4j.simpleLogger.log.org.apache.maven.cli.transfer.Slf4jMavenTransferListener=warn"
 }
 
+check_cython() {
+    set -ex
+    local python_ver=$1
+    local is_cython_used=$(python${python_ver} <=1.21.1
 requests<2.19.0,>=2.18.4
 graphviz<0.9.0,>=0.8.1
-numpy<=1.15.0,>=1.8.2
+numpy>1.16.0,<2.0.0
 mock
 nose
 nose-timer
diff --git a/ci/windows/test_py2_cpu.ps1 b/ci/windows/test_py2_cpu.ps1
index 1c4a72682ae5..df9b15ba1ec3 100644
--- a/ci/windows/test_py2_cpu.ps1
+++ b/ci/windows/test_py2_cpu.ps1
@@ -24,6 +24,10 @@ $env:MXNET_HOME=[io.path]::combine($PSScriptRoot, 'mxnet_home')
 C:\Python27\Scripts\pip install -r tests\requirements.txt
 C:\Python27\python.exe -m nose -v --with-timer --timer-ok 1 --timer-warning 15 --timer-filter warning,error --with-xunit --xunit-file nosetests_unittest.xml tests\python\unittest
-if (! $?) { Throw ("Error running unittest") }
+if (! $?) { Throw ("Error running unittest, python exited with status code " + ('{0:X}' -f $LastExitCode)) }
 C:\Python27\python.exe -m nose -v --with-timer --timer-ok 1 --timer-warning 15 --timer-filter warning,error --with-xunit --xunit-file nosetests_train.xml tests\python\train
-if (! $?) { Throw ("Error running train tests") }
+if (! $?) { Throw ("Error running train tests, python exited with status code " + ('{0:X}' -f $LastExitCode)) }
+# Adding this extra test since it's not possible to set env var on the fly in Windows.
+$env:MXNET_SAFE_ACCUMULATION=1
+C:\Python27\python.exe -m nose -v --with-timer --timer-ok 1 --timer-warning 15 --timer-filter warning,error --with-xunit --xunit-file nosetests_unittest.xml tests\python\unittest\test_operator.py:test_norm
+if (! $?)
{ Throw ("Error running unittest, python exited with status code " + ('{0:X}' -f $LastExitCode)) } diff --git a/ci/windows/test_py2_gpu.ps1 b/ci/windows/test_py2_gpu.ps1 index 8a6c8e9b44f9..f2974ff6f7b6 100644 --- a/ci/windows/test_py2_gpu.ps1 +++ b/ci/windows/test_py2_gpu.ps1 @@ -24,10 +24,14 @@ $env:MXNET_HOME=[io.path]::combine($PSScriptRoot, 'mxnet_home') C:\Python27\Scripts\pip install -r tests\requirements.txt C:\Python27\python.exe -m nose -v --with-timer --timer-ok 1 --timer-warning 15 --timer-filter warning,error --with-xunit --xunit-file nosetests_unittest.xml tests\python\unittest -if (! $?) { Throw ("Error running unittest") } +if (! $?) { Throw ("Error running unittest, python exited with status code " + ('{0:X}' -f $LastExitCode)) } C:\Python27\python.exe -m nose -v --with-timer --timer-ok 1 --timer-warning 15 --timer-filter warning,error --with-xunit --xunit-file nosetests_operator.xml tests\python\gpu\test_operator_gpu.py -if (! $?) { Throw ("Error running tests") } +if (! $?) { Throw ("Error running tests, python exited with status code " + ('{0:X}' -f $LastExitCode)) } C:\Python27\python.exe -m nose -v --with-timer --timer-ok 1 --timer-warning 15 --timer-filter warning,error --with-xunit --xunit-file nosetests_forward.xml tests\python\gpu\test_forward.py -if (! $?) { Throw ("Error running tests") } +if (! $?) { Throw ("Error running tests, python exited with status code " + ('{0:X}' -f $LastExitCode)) } C:\Python27\python.exe -m nose -v --with-timer --timer-ok 1 --timer-warning 15 --timer-filter warning,error tests\python\train -if (! $?) { Throw ("Error running tests") } +if (! $?) { Throw ("Error running tests, python exited with status code " + ('{0:X}' -f $LastExitCode)) } +# Adding this extra test since it's not possible to set env var on the fly in Windows. +$env:MXNET_SAFE_ACCUMULATION=1 +C:\Python27\python.exe -m nose -v --with-timer --timer-ok 1 --timer-warning 15 --timer-filter warning,error --with-xunit --xunit-file nosetests_operator.xml tests\python\gpu\test_operator_gpu.py:test_norm +if (! $?) { Throw ("Error running tests, python exited with status code " + ('{0:X}' -f $LastExitCode)) } diff --git a/ci/windows/test_py3_cpu.ps1 b/ci/windows/test_py3_cpu.ps1 index a7067f9f3f83..900bfd161cd0 100644 --- a/ci/windows/test_py3_cpu.ps1 +++ b/ci/windows/test_py3_cpu.ps1 @@ -24,6 +24,10 @@ $env:MXNET_HOME=[io.path]::combine($PSScriptRoot, 'mxnet_home') C:\Python37\Scripts\pip install -r tests\requirements.txt C:\Python37\python.exe -m nose -v --with-timer --timer-ok 1 --timer-warning 15 --timer-filter warning,error --with-xunit --xunit-file nosetests_unittest.xml tests\python\unittest -if (! $?) { Throw ("Error running unittest") } +if (! $?) { Throw ("Error running unittest, python exited with status code " + ('{0:X}' -f $LastExitCode)) } C:\Python37\python.exe -m nose -v --with-timer --timer-ok 1 --timer-warning 15 --timer-filter warning,error --with-xunit --xunit-file nosetests_train.xml tests\python\train -if (! $?) { Throw ("Error running train tests") } +if (! $?) { Throw ("Error running train tests, python exited with status code " + ('{0:X}' -f $LastExitCode)) } +# Adding this extra test since it's not possible to set env var on the fly in Windows. +$env:MXNET_SAFE_ACCUMULATION=1 +C:\Python37\python.exe -m nose -v --with-timer --timer-ok 1 --timer-warning 15 --timer-filter warning,error --with-xunit --xunit-file nosetests_unittest.xml tests\python\unittest\test_operator.py:test_norm +if (! $?) 
{ Throw ("Error running unittest, python exited with status code " + ('{0:X}' -f $LastExitCode)) } diff --git a/ci/windows/test_py3_gpu.ps1 b/ci/windows/test_py3_gpu.ps1 index 5fbc9f2f8036..b6e951b291fb 100644 --- a/ci/windows/test_py3_gpu.ps1 +++ b/ci/windows/test_py3_gpu.ps1 @@ -24,10 +24,14 @@ $env:MXNET_HOME=[io.path]::combine($PSScriptRoot, 'mxnet_home') C:\Python37\Scripts\pip install -r tests\requirements.txt C:\Python37\python.exe -m nose -v --with-timer --timer-ok 1 --timer-warning 15 --timer-filter warning,error --with-xunit --xunit-file nosetests_unittest.xml tests\python\unittest -if (! $?) { Throw ("Error running unittest") } +if (! $?) { Throw ("Error running unittest, python exited with status code " + ('{0:X}' -f $LastExitCode)) } C:\Python37\python.exe -m nose -v --with-timer --timer-ok 1 --timer-warning 15 --timer-filter warning,error --with-xunit --xunit-file nosetests_operator.xml tests\python\gpu\test_operator_gpu.py -if (! $?) { Throw ("Error running tests") } +if (! $?) { Throw ("Error running tests, python exited with status code " + ('{0:X}' -f $LastExitCode)) } C:\Python37\python.exe -m nose -v --with-timer --timer-ok 1 --timer-warning 15 --timer-filter warning,error --with-xunit --xunit-file nosetests_forward.xml tests\python\gpu\test_forward.py -if (! $?) { Throw ("Error running tests") } +if (! $?) { Throw ("Error running tests, python exited with status code " + ('{0:X}' -f $LastExitCode)) } C:\Python37\python.exe -m nose -v --with-timer --timer-ok 1 --timer-warning 15 --timer-filter warning,error --with-xunit --xunit-file nosetests_train.xml tests\python\train -if (! $?) { Throw ("Error running tests") } +if (! $?) { Throw ("Error running tests, python exited with status code " + ('{0:X}' -f $LastExitCode)) } +# Adding this extra test since it's not possible to set env var on the fly in Windows. +$env:MXNET_SAFE_ACCUMULATION=1 +C:\Python37\python.exe -m nose -v --with-timer --timer-ok 1 --timer-warning 15 --timer-filter warning,error --with-xunit --xunit-file nosetests_operator.xml tests\python\gpu\test_operator_gpu.py:test_norm +if (! $?) { Throw ("Error running tests, python exited with status code " + ('{0:X}' -f $LastExitCode)) } diff --git a/cmake/BuildCythonModules.cmake b/cmake/BuildCythonModules.cmake new file mode 100644 index 000000000000..d2c3a46f1a71 --- /dev/null +++ b/cmake/BuildCythonModules.cmake @@ -0,0 +1,38 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+
+function(add_cython_modules python_version)
+  unset(PYTHON_EXECUTABLE CACHE)
+  set(PYTHONINTERP_FOUND FALSE)
+  find_package(PythonInterp ${python_version} EXACT)
+  if(PYTHONINTERP_FOUND)
+    find_program(CYTHON_EXECUTABLE NAMES cython)
+    if(CYTHON_EXECUTABLE)
+      add_custom_command(TARGET mxnet POST_BUILD
+                         COMMAND ${CMAKE_COMMAND} -E env MXNET_LIBRARY_PATH=${CMAKE_BINARY_DIR}/libmxnet.so
+                                 ${PYTHON_EXECUTABLE} setup.py build_ext --inplace --with-cython
+                         WORKING_DIRECTORY "${PROJECT_SOURCE_DIR}/python")
+      message("-- Cython modules for python${python_version} will be built")
+      set(PYTHON${python_version}_FOUND 1 PARENT_SCOPE)
+    else()
+      message(FATAL_ERROR "-- Cython not found")
+    endif()
+  else()
+    set(PYTHON${python_version}_FOUND 0 PARENT_SCOPE)
+  endif()
+endfunction()
diff --git a/cmake/ChooseBlas.cmake b/cmake/ChooseBlas.cmake
index 5f4af2d89c91..e16594794ae8 100644
--- a/cmake/ChooseBlas.cmake
+++ b/cmake/ChooseBlas.cmake
@@ -18,14 +18,14 @@
 set(BLAS "Open" CACHE STRING "Selected BLAS library")
 set_property(CACHE BLAS PROPERTY STRINGS "Atlas;Open;MKL")
 
-if(USE_MKL_IF_AVAILABLE)
-  if(NOT MKL_FOUND)
-    find_package(MKL)
-  endif()
-  if(MKL_FOUND)
-    if(USE_MKLDNN)
-      set(BLAS "open")
-    else()
+if(DEFINED USE_BLAS)
+  set(BLAS "${USE_BLAS}")
+else()
+  if(USE_MKL_IF_AVAILABLE)
+    if(NOT MKL_FOUND)
+      find_package(MKL)
+    endif()
+    if(MKL_FOUND)
       set(BLAS "MKL")
     endif()
   endif()
diff --git a/cmake/DownloadMKLML.cmake b/cmake/DownloadMKLML.cmake
index 7b0e5ecf7c9c..73a588fa8afe 100644
--- a/cmake/DownloadMKLML.cmake
+++ b/cmake/DownloadMKLML.cmake
@@ -19,12 +19,12 @@
 
 message(STATUS "Downloading MKLML...")
 
-set(MKLDNN_RELEASE v0.18)
-set(MKLML_RELEASE_FILE_SUFFIX 2019.0.3.20190220)
+set(MKLDNN_RELEASE v0.19)
+set(MKLML_RELEASE_FILE_SUFFIX 2019.0.5.20190502)
 
-set(MKLML_LNX_MD5 76354b74325cd293aba593d7cbe36b3f)
-set(MKLML_WIN_MD5 02286cb980f12af610c05e99dbd78755)
-set(MKLML_MAC_MD5 3b28da686a25a4cf995ca4fc5e30e514)
+set(MKLML_LNX_MD5 dfcea335652dbf3518e1d02cab2cea97)
+set(MKLML_WIN_MD5 ff8c5237570f03eea37377ccfc95a08a)
+set(MKLML_MAC_MD5 0a3d83ec1fed9ea318e8573bb5e14c24)
 
 if(MSVC)
   set(MKL_NAME "mklml_win_${MKLML_RELEASE_FILE_SUFFIX}")
diff --git a/cmake/Modules/FindMKL.cmake b/cmake/Modules/FindMKL.cmake
index 70405566d8ae..51fca23c1161 100644
--- a/cmake/Modules/FindMKL.cmake
+++ b/cmake/Modules/FindMKL.cmake
@@ -43,55 +43,6 @@ endif()
 
 # ---[ Root folders
 set(INTEL_ROOT "/opt/intel" CACHE PATH "Folder contains intel libs")
 
-if(USE_MKLDNN)
-
-  find_path(MKL_ROOT include/mkl_blas.h
-    PATHS $ENV{MKL_ROOT}
-    ${INTEL_ROOT}/mklml
-    ${DIRECT_DEPENDENCY_ROOTS}
-    DOC "Folder contains MKL"
-    )
-
-  # ---[ Find include dir
-  find_path(MKL_INCLUDE_DIR mkl_blas.h PATHS ${MKL_ROOT} PATH_SUFFIXES include)
-  set(__looked_for MKL_INCLUDE_DIR)
-
-  # ---[ Find libraries
-  if(CMAKE_SIZEOF_VOID_P EQUAL 4)
-    set(__path_suffixes lib lib/ia32)
-  else()
-    set(__path_suffixes lib lib/intel64)
-  endif()
-
-  set(__mkl_libs "")
-
-  if(WIN32)
-    list(APPEND __mkl_libs mklml_intel)
-  else()
-    list(APPEND __mkl_libs mklml_gnu)
-  endif()
-  list(APPEND __mkl_libs mkldnn)
-
-  foreach (__lib ${__mkl_libs})
-    set(__mkl_lib "${__lib}")
-    string(TOUPPER ${__mkl_lib} __mkl_lib_upper)
-
-    if(MKL_USE_STATIC_LIBS)
-      set(__mkl_lib "lib${__mkl_lib}.a")
-    endif()
-
-    find_library(${__mkl_lib_upper}_LIBRARY
-          NAMES ${__mkl_lib}
-          PATHS ${MKL_ROOT} "${MKL_INCLUDE_DIR}/.."
- PATH_SUFFIXES ${__path_suffixes} - DOC "The path to Intel(R) MKL ${__mkl_lib} library") - mark_as_advanced(${__mkl_lib_upper}_LIBRARY) - - list(APPEND __looked_for ${__mkl_lib_upper}_LIBRARY) - list(APPEND MKL_LIBRARIES ${${__mkl_lib_upper}_LIBRARY}) - endforeach() - -else(USE_MKLDNN) # ---[ Options mxnet_option(MKL_USE_SINGLE_DYNAMIC_LIBRARY "Use single dynamic library interface" ON) @@ -193,7 +144,7 @@ else(USE_MKLDNN) list(APPEND MKL_LIBRARIES ${MKL_RTL_LIBRARY}) endif() -endif(USE_MKLDNN) + include(FindPackageHandleStandardArgs) find_package_handle_standard_args(MKL DEFAULT_MSG ${__looked_for}) diff --git a/contrib/clojure-package/src/org/apache/clojure_mxnet/image.clj b/contrib/clojure-package/src/org/apache/clojure_mxnet/image.clj index f81a35803171..68dcbfec5850 100644 --- a/contrib/clojure-package/src/org/apache/clojure_mxnet/image.clj +++ b/contrib/clojure-package/src/org/apache/clojure_mxnet/image.clj @@ -17,6 +17,7 @@ (ns org.apache.clojure-mxnet.image "Image API of Clojure package." + (:refer-clojure :exclude [read]) (:require [t6.from-scala.core :refer [$ $$] :as $] [org.apache.clojure-mxnet.dtype :as dtype] [org.apache.clojure-mxnet.ndarray :as ndarray] @@ -38,8 +39,10 @@ (s/def ::decode-image-opts (s/keys :opt-un [::color-flag ::to-rgb ::output])) -(defn decode-image - "Decodes an image from an input stream with OpenCV +(defn ^:deprecated decode-image + "DEPRECATED: use `decode` instead. + + Decodes an image from an input stream with OpenCV `input-stream`: `InputStream` - Contains the binary encoded image `color-flag`: 0 or 1 - Convert decoded image to grayscale (0) or color (1) `to-rgb`: boolean - Whether to convert decoded image to mxnet's default RGB @@ -60,14 +63,47 @@ ([input-stream] (decode-image input-stream {}))) +(s/def ::color #{:grayscale :color}) +(s/def ::decode-image-opts-2 (s/keys :opt-un [::color ::to-rgb ::output])) + +(defn- color->int [color] + (case color + :grayscale 0 + :color 1)) + +(defn decode + "Decodes an image from an input stream with OpenCV. + `input-stream`: `InputStream` - Contains the binary encoded image + `color`: keyword in `#{:color :grayscale}` - Convert decoded image to + grayscale or color + `to-rgb`: boolean - Whether to convert decoded image to mxnet's default RGB + format (instead of opencv's default BGR) + `output`: nil or `NDArray` + returns: `NDArray` with dtype uint8 + + Ex: + (decode input-stream) + (decode input-stream {:color :color}) + (decode input-stream {:color :grayscale :output nd})" + ([input-stream {:keys [color to-rgb output] + :or {color :color to-rgb true output nil} + :as opts}] + (util/validate! ::input-stream input-stream "Invalid input stream") + (util/validate! ::decode-image-opts-2 opts "Invalid options for decoding") + (Image/imDecode input-stream (color->int color) to-rgb ($/option output))) + ([input-stream] + (decode input-stream {}))) + (s/def ::filename string?) (s/def ::optional-color-flag (s/or :none nil? :some ::color-flag)) (s/def ::optional-to-rgb (s/or :none nil? :some ::to-rgb)) -(defn read-image - "Reads an image file and returns an ndarray with OpenCV. It returns image in +(defn ^:deprecated read-image + "DEPRECATED: use `read` instead. + + Reads an image file and returns an ndarray with OpenCV. It returns image in RGB by default instead of OpenCV's default BGR. 
`filename`: string - Name of the image file to be loaded
   `color-flag`: 0 or 1 - Convert decoded image to grayscale (0) or color (1)
@@ -95,11 +131,43 @@
   ([filename]
    (read-image filename {})))
 
+(defn read
+  "Reads an image file and returns an ndarray with OpenCV. It returns image in
+  RGB by default instead of OpenCV's default BGR.
+  `filename`: string - Name of the image file to be loaded
+  `color`: keyword in `#{:color :grayscale}` - Convert decoded image to
+           grayscale or color
+  `to-rgb`: boolean - Whether to convert decoded image to mxnet's default RGB
+            format (instead of opencv's default BGR)
+  `output`: nil or `NDArray`
+  returns: `NDArray` with dtype uint8
+
+  Ex:
+    (read \"cat.jpg\")
+    (read \"cat.jpg\" {:color :grayscale})
+    (read \"cat.jpg\" {:color :color :output nd})"
+  ([filename {:keys [color to-rgb output]
+              :or {color :color to-rgb nil output nil}
+              :as opts}]
+   (util/validate! ::filename filename "Invalid filename")
+   (util/validate! ::color color "Invalid color")
+   (util/validate! ::optional-to-rgb to-rgb "Invalid conversion flag")
+   (util/validate! ::output output "Invalid output")
+   (Image/imRead
+    filename
+    ($/option (when color (color->int color)))
+    ($/option to-rgb)
+    ($/option output)))
+  ([filename]
+   (read filename {})))
+
 (s/def ::int int?)
 (s/def ::optional-int (s/or :none nil? :some int?))
 
-(defn resize-image
-  "Resizes the image array to (width, height)
+(defn ^:deprecated resize-image
+  "DEPRECATED: use `resize` instead.
+
+  Resizes the image array to (width, height)
   `input`: `NDArray` - source image in NDArray
   `w`: int - Width of resized image
   `h`: int - Height of resized image
@@ -122,6 +190,30 @@
   ([input w h]
    (resize-image input w h {})))
 
+(defn resize
+  "Resizes the image array to (width, height)
+  `input`: `NDArray` - source image in NDArray
+  `w`: int - Width of resized image
+  `h`: int - Height of resized image
+  `interpolation`: Interpolation method. Default is INTER_LINEAR
+  `output`: nil or `NDArray`
+  returns: `NDArray`
+
+  Ex:
+    (resize nd-img 300 300)
+    (resize nd-img 28 28 {:output nd})"
+  ([input w h {:keys [interpolation output]
+               :or {interpolation nil output nil}
+               :as opts}]
+   (util/validate! ::ndarray input "Invalid input array")
+   (util/validate! ::int w "Invalid width")
+   (util/validate! ::int h "Invalid height")
+   (util/validate! ::optional-int interpolation "Invalid interpolation")
+   (util/validate! ::output output "Invalid output")
+   (Image/imResize input w h ($/option interpolation) ($/option output)))
+  ([input w h]
+   (resize input w h {})))
+
 (defn apply-border
   "Pad image border with OpenCV.
   `input`: `NDArray` - source image in NDArray
@@ -193,7 +285,17 @@
 
 (s/def ::to-image-ndarray
   (s/and ::ndarray ::all-bytes ::rgb-array))
 
-(defn to-image
+(defn ^:deprecated to-image
+  "DEPRECATED: use `ndarray->image` instead.
+
+  Convert a NDArray image in RGB format to a real image.
+  `input`: `NDArray` - Source image in NDArray
+  returns: `BufferedImage`"
+  [input]
+  (util/validate! ::to-image-ndarray input "Invalid input array")
+  (Image/toImage input))
+
+(defn ndarray->image
   "Convert a NDArray image in RGB format to a real image.
`input`: `NDArray` - Source image in NDArray returns: `BufferedImage`" diff --git a/contrib/clojure-package/test/good-test-ndarray-api.clj b/contrib/clojure-package/test/good-test-ndarray-api.clj index 7554089d0ba0..f7f58f8f7c88 100644 --- a/contrib/clojure-package/test/good-test-ndarray-api.clj +++ b/contrib/clojure-package/test/good-test-ndarray-api.clj @@ -106,7 +106,7 @@ - Defined in src/operator/nn/batch_norm.cc:L574 + Defined in src/operator/nn/batch_norm.cc:L572 `data`: Input data to batch normalization `gamma`: gamma array diff --git a/contrib/clojure-package/test/good-test-symbol-api.clj b/contrib/clojure-package/test/good-test-symbol-api.clj index c7450f8eb5c1..3081304ebdb3 100644 --- a/contrib/clojure-package/test/good-test-symbol-api.clj +++ b/contrib/clojure-package/test/good-test-symbol-api.clj @@ -119,7 +119,7 @@ - Defined in src/operator/nn/batch_norm.cc:L574 + Defined in src/operator/nn/batch_norm.cc:L572 `data`: Input data to batch normalization (optional) `gamma`: gamma array (optional) diff --git a/contrib/clojure-package/test/org/apache/clojure_mxnet/image_test.clj b/contrib/clojure-package/test/org/apache/clojure_mxnet/image_test.clj index 23b88d07e896..fd200f18a78f 100644 --- a/contrib/clojure-package/test/org/apache/clojure_mxnet/image_test.clj +++ b/contrib/clojure-package/test/org/apache/clojure_mxnet/image_test.clj @@ -19,73 +19,102 @@ (:require [org.apache.clojure-mxnet.image :as image] [org.apache.clojure-mxnet.ndarray :as ndarray] [clojure.java.io :as io] - [clojure.test :refer :all]) + [clojure.test :refer [deftest is use-fixtures]]) (:import (javax.imageio ImageIO) (java.io File))) (def tmp-dir (System/getProperty "java.io.tmpdir")) (def image-path (.getAbsolutePath (io/file tmp-dir "Pug-Cookie.jpg"))) +(def image-src-path "test/test-images/Pug-Cookie.jpg") -(defn download-image [] - (with-open [in (io/input-stream "https://s3.amazonaws.com/model-server/inputs/Pug-Cookie.jpg") - out (io/output-stream (io/file image-path))] +(defn- cp + "Copy from filepath `from` to filepath `to`." + [from to] + (with-open [in (io/input-stream (io/file from)) + out (io/output-stream (io/file to))] (io/copy in out))) -(defn delete-image [] - (io/delete-file image-path)) +(defn- rm + "Removes `filepath`." + [filepath] + (io/delete-file filepath)) -(defn with-downloaded-image [f] - (download-image) - (f) - (delete-image)) +(defn- with-file + "Provides `src-path` in `dest-path` for the test function `f` to use." 
+ [src-path dest-path] + (fn [f] + (cp src-path dest-path) + (f) + (rm dest-path))) -(use-fixtures :once with-downloaded-image) +(use-fixtures :once (with-file image-src-path image-path)) (deftest test-decode-image - (let [img-arr (image/decode-image - (io/input-stream image-path)) - img-arr-2 (image/decode-image - (io/input-stream image-path) - {:color-flag image/GRAYSCALE})] + (let [img-arr (image/decode-image (io/input-stream image-path)) + img-arr-2 (image/decode-image (io/input-stream image-path) + {:color-flag image/GRAYSCALE})] + (is (= [576 1024 3] (ndarray/shape-vec img-arr))) + (is (= [576 1024 1] (ndarray/shape-vec img-arr-2))))) + +(deftest test-decode + (let [img-arr (image/decode (io/input-stream image-path)) + img-arr-2 (image/decode (io/input-stream image-path) + {:color :grayscale})] (is (= [576 1024 3] (ndarray/shape-vec img-arr))) (is (= [576 1024 1] (ndarray/shape-vec img-arr-2))))) (deftest test-read-image (let [img-arr (image/read-image image-path) - img-arr-2 (image/read-image - image-path - {:color-flag image/GRAYSCALE})] + img-arr-2 (image/read-image image-path {:color-flag image/GRAYSCALE})] + (is (= [576 1024 3] (ndarray/shape-vec img-arr))) + (is (= [576 1024 1] (ndarray/shape-vec img-arr-2))))) + +(deftest test-read + (let [img-arr (image/read image-path) + img-arr-2 (image/read image-path {:color :grayscale})] (is (= [576 1024 3] (ndarray/shape-vec img-arr))) (is (= [576 1024 1] (ndarray/shape-vec img-arr-2))))) (deftest test-resize-image - (let [img-arr (image/read-image image-path) + (let [img-arr (image/read image-path) resized-arr (image/resize-image img-arr 224 224)] (is (= [224 224 3] (ndarray/shape-vec resized-arr))))) -(deftest test-crop-image - (let [img-arr (image/read-image image-path) +(deftest test-resize + (let [img-arr (image/read image-path) + resized-arr (image/resize img-arr 224 224)] + (is (= [224 224 3] (ndarray/shape-vec resized-arr))))) + +(deftest test-fixed-crop + (let [img-arr (image/read image-path) cropped-arr (image/fixed-crop img-arr 0 0 224 224)] (is (= [224 224 3] (ndarray/shape-vec cropped-arr))))) (deftest test-apply-border - (let [img-arr (image/read-image image-path) + (let [img-arr (image/read image-path) padded-arr (image/apply-border img-arr 1 1 1 1)] (is (= [578 1026 3] (ndarray/shape-vec padded-arr))))) (deftest test-to-image - (let [img-arr (image/read-image image-path) - resized-arr (image/resize-image img-arr 224 224) + (let [img-arr (image/read image-path) + resized-arr (image/resize img-arr 224 224) new-img (image/to-image resized-arr)] (is (ImageIO/write new-img "png" (io/file tmp-dir "out.png"))))) +(deftest test-ndarray->image + (let [img-arr (image/read image-path) + resized-arr (image/resize img-arr 224 224) + new-img (image/ndarray->image resized-arr)] + (is (ImageIO/write new-img "png" (io/file tmp-dir "out.png"))))) + (deftest test-draw-bounding-box! (let [orig-img (ImageIO/read (new File image-path)) - new-img (-> orig-img - (image/draw-bounding-box! [{:x-min 190 :x-max 850 :y-min 50 :y-max 450} - {:x-min 200 :x-max 350 :y-min 440 :y-max 530}] - {:stroke 2 - :names ["pug" "cookie"] - :transparency 0.8 - :font-size-mult 2.0}))] + new-img (image/draw-bounding-box! 
+ orig-img + [{:x-min 190 :x-max 850 :y-min 50 :y-max 450} + {:x-min 200 :x-max 350 :y-min 440 :y-max 530}] + {:stroke 2 + :names ["pug" "cookie"] + :transparency 0.8 + :font-size-mult 2.0})] (is (ImageIO/write new-img "png" (io/file tmp-dir "out.png"))))) diff --git a/cpp-package/example/get_data.sh b/cpp-package/example/get_data.sh index b0913bdb684d..e11077234ade 100755 --- a/cpp-package/example/get_data.sh +++ b/cpp-package/example/get_data.sh @@ -51,11 +51,12 @@ download () { (($? != 0)) && exit 1 || return 0 } +# MNIST dataset from: http://yann.lecun.com/exdb/mnist/ FILES=( - "https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/dataset/mnist/train-images-idx3-ubyte.gz" - "https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/dataset/mnist/train-labels-idx1-ubyte.gz" - "https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/dataset/mnist/t10k-images-idx3-ubyte.gz" - "https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/dataset/mnist/t10k-labels-idx1-ubyte.gz" + "http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz" + "http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz" + "http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz" + "http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz" "http://data.mxnet.io/data/mnist_train.csv.gz") for FILE in ${FILES[@]}; do diff --git a/cpp-package/example/inference/inception_inference.cpp b/cpp-package/example/inference/inception_inference.cpp index fa5600190f95..cb952aa69f54 100644 --- a/cpp-package/example/inference/inception_inference.cpp +++ b/cpp-package/example/inference/inception_inference.cpp @@ -302,13 +302,11 @@ void Predictor::PredictImage(const std::string& image_file) { // The output is available in executor->outputs. auto array = executor->outputs[0].Copy(Context::cpu()); - /* * Find out the maximum accuracy and the index associated with that accuracy. * This is done by using the argmax operator on NDArray. */ auto predicted = array.ArgmaxChannel(); - /* * Wait until all the previous write operations on the 'predicted' * NDArray to be complete before we read it. @@ -317,7 +315,7 @@ void Predictor::PredictImage(const std::string& image_file) { */ predicted.WaitToRead(); - int best_idx = predicted.At(0, 0); + int best_idx = predicted.At(0); float best_accuracy = array.At(0, best_idx); if (output_labels.empty()) { @@ -331,9 +329,7 @@ void Predictor::PredictImage(const std::string& image_file) { Predictor::~Predictor() { - if (executor) { - delete executor; - } + delete executor; MXNotifyShutdown(); } diff --git a/cpp-package/example/test_ndarray_copy.cpp b/cpp-package/example/test_ndarray_copy.cpp new file mode 100644 index 000000000000..a3b3011993fa --- /dev/null +++ b/cpp-package/example/test_ndarray_copy.cpp @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ */
+#include <vector>
+#include "mxnet/c_api.h"
+#include "dmlc/logging.h"
+#include "mxnet-cpp/MxNetCpp.h"
+using namespace mxnet::cpp;
+
+enum TypeFlag {
+  kFloat32 = 0,
+  kFloat64 = 1,
+  kFloat16 = 2,
+  kUint8 = 3,
+  kInt32 = 4,
+  kInt8 = 5,
+  kInt64 = 6,
+};
+
+/*
+ * The file is used for testing if there exist type inconsistency
+ * when using Copy API to create a new NDArray.
+ * By running: build/test_ndarray_copy.
+ */
+int main(int argc, char** argv) {
+  std::vector<mx_uint> shape1{128, 2, 32};
+  Shape shape2(32, 8, 64);
+
+  int gpu_count = 0;
+  if (MXGetGPUCount(&gpu_count) != 0) {
+    LOG(ERROR) << "MXGetGPUCount failed";
+    return -1;
+  }
+
+  Context context = (gpu_count > 0) ? Context::gpu() : Context::cpu();
+
+  NDArray src1(shape1, context, true, kFloat16);
+  NDArray src2(shape2, context, false, kInt8);
+  NDArray dst1, dst2;
+  dst1 = src1.Copy(context);
+  dst2 = src2.Copy(context);
+  NDArray::WaitAll();
+  CHECK_EQ(src1.GetDType(), dst1.GetDType());
+  CHECK_EQ(src2.GetDType(), dst2.GetDType());
+  return 0;
+}
diff --git a/cpp-package/include/mxnet-cpp/ndarray.h b/cpp-package/include/mxnet-cpp/ndarray.h
index 6f37d91aa68e..c4d51c59a532 100644
--- a/cpp-package/include/mxnet-cpp/ndarray.h
+++ b/cpp-package/include/mxnet-cpp/ndarray.h
@@ -131,18 +131,21 @@ class NDArray {
   /*!
   * \brief construct a new dynamic NDArray
   * \param shape the shape of array
-  * \param constext context of NDArray
+  * \param context context of NDArray
   * \param delay_alloc whether delay the allocation
+  * \param dtype data type of NDArray
   */
   NDArray(const std::vector<mx_uint> &shape, const Context &context,
-          bool delay_alloc = true);
+          bool delay_alloc = true, int dtype = 0);
   /*!
   * \brief construct a new dynamic NDArray
   * \param shape the shape of array
   * \param constext context of NDArray
   * \param delay_alloc whether delay the allocation
+  * \param dtype data type of NDArray
   */
-  NDArray(const Shape &shape, const Context &context, bool delay_alloc = true);
+  NDArray(const Shape &shape, const Context &context,
+          bool delay_alloc = true, int dtype = 0);
   NDArray(const mx_float *data, size_t size);
   /*!
   * \brief construct a new dynamic NDArray
@@ -318,6 +321,12 @@ class NDArray {
   */
   size_t Offset(size_t c, size_t h, size_t w) const;
   /*!
+  * \brief return value of the element at (index)
+  * \param index position
+  * \return value of the one dimensional array
+  */
+  mx_float At(size_t index) const;
+  /*!
* \brief return value of the element at (h, w)
  * \param h height position
  * \param w width position
diff --git a/cpp-package/include/mxnet-cpp/ndarray.hpp b/cpp-package/include/mxnet-cpp/ndarray.hpp
index d0438305a62e..ed23c76ddc00 100644
--- a/cpp-package/include/mxnet-cpp/ndarray.hpp
+++ b/cpp-package/include/mxnet-cpp/ndarray.hpp
@@ -47,17 +47,18 @@ inline NDArray::NDArray(const NDArrayHandle &handle) {
   blob_ptr_ = std::make_shared<NDBlob>(handle);
 }
 inline NDArray::NDArray(const std::vector<mx_uint> &shape, const Context &context,
-                        bool delay_alloc) {
+                        bool delay_alloc, int dtype) {
   NDArrayHandle handle;
-  CHECK_EQ(MXNDArrayCreate(shape.data(), shape.size(), context.GetDeviceType(),
-                           context.GetDeviceId(), delay_alloc, &handle),
+  CHECK_EQ(MXNDArrayCreateEx(shape.data(), shape.size(), context.GetDeviceType(),
+                             context.GetDeviceId(), delay_alloc, dtype, &handle),
            0);
   blob_ptr_ = std::make_shared<NDBlob>(handle);
 }
-inline NDArray::NDArray(const Shape &shape, const Context &context, bool delay_alloc) {
+inline NDArray::NDArray(const Shape &shape, const Context &context,
+                        bool delay_alloc, int dtype) {
   NDArrayHandle handle;
-  CHECK_EQ(MXNDArrayCreate(shape.data(), shape.ndim(), context.GetDeviceType(),
-                           context.GetDeviceId(), delay_alloc, &handle),
+  CHECK_EQ(MXNDArrayCreateEx(shape.data(), shape.ndim(), context.GetDeviceType(),
+                             context.GetDeviceId(), delay_alloc, dtype, &handle),
            0);
   blob_ptr_ = std::make_shared<NDBlob>(handle);
 }
@@ -208,7 +209,7 @@ inline void NDArray::SyncCopyToCPU(std::vector<mx_float> *data, size_t size) {
   MXNDArraySyncCopyToCPU(blob_ptr_->handle_, data->data(), size);
 }
 inline NDArray NDArray::Copy(const Context &ctx) const {
-  NDArray ret(GetShape(), ctx);
+  NDArray ret(GetShape(), ctx, true, this->GetDType());
   Operator("_copyto")(*this).Invoke(ret);
   return ret;
 }
@@ -374,11 +375,15 @@ inline void NDArray::Save(const std::string &file_name,
 }
 
 inline size_t NDArray::Offset(size_t h, size_t w) const {
-  return (h * GetShape()[1]) + w;
+  auto const shape = GetShape();
+  CHECK_EQ(shape.size(), 2) << "The NDArray needs to be 2 dimensional.";
+
+  return (h * shape[1]) + w;
 }
 
 inline size_t NDArray::Offset(size_t c, size_t h, size_t w) const {
   auto const shape = GetShape();
+  CHECK_EQ(shape.size(), 3) << "The NDArray needs to be 3 dimensional.";
   return h * shape[0] * shape[2] + w * shape[0] + c;
 }
 
@@ -390,6 +395,13 @@ inline mx_float NDArray::At(size_t c, size_t h, size_t w) const {
   return GetData()[Offset(c, h, w)];
 }
 
+inline mx_float NDArray::At(size_t index) const {
+  auto shape = GetShape();
+  CHECK_EQ(shape.size(), 1) << "The NDArray needs to be 1 dimensional.";
+  CHECK_LT(index, shape[0]) << "Specified index is out of range.";
+  return GetData()[index];
+}
+
 inline size_t NDArray::Size() const {
   size_t ret = 1;
   for (auto &i : GetShape()) ret *= i;
diff --git a/cpp-package/tests/ci_test.sh b/cpp-package/tests/ci_test.sh
index 2d1f8e4f68e6..ef7fceacfd6e 100755
--- a/cpp-package/tests/ci_test.sh
+++ b/cpp-package/tests/ci_test.sh
@@ -57,6 +57,9 @@ cp ../../build/cpp-package/example/test_kvstore .
 cp ../../build/cpp-package/example/test_score .
 ./test_score 0.93
 
+cp ../../build/cpp-package/example/test_ndarray_copy .
+./test_ndarray_copy
+
 sh unittests/unit_test_mlp_csv.sh
 
 cd inference
diff --git a/docs/_static/js/options.js b/docs/_static/js/options.js
index ec3977601c92..ca4ac363dc41 100644
--- a/docs/_static/js/options.js
+++ b/docs/_static/js/options.js
@@ -19,7 +19,7 @@
  */
 
 /* Installation page display functions for install selector */
-var versionSelect = defaultVersion = 'v1.4.0';
+var versionSelect = defaultVersion = 'v1.4.1';
 var platformSelect = 'Linux';
 var languageSelect = 'Python';
 var processorSelect = 'CPU';
diff --git a/docs/_static/mxnet-theme/index.html b/docs/_static/mxnet-theme/index.html
index 34f675853924..a5f0bed636e9 100644
--- a/docs/_static/mxnet-theme/index.html
+++ b/docs/_static/mxnet-theme/index.html
@@ -23,14 +23,14 @@
-          MXNet 1.4.0 Released
-          This release introduces the Java Inference API and Julia API, as well as Control Flow Operators, MKLDNN optimizations, and SVRG optimization.
-          Learn More
+          MXNet 1.4.1 Released
+          This patch release features bug fixes and performance improvements.
+          Learn More
           A 60-minute Gluon Crash Course
           Check out our quick overview of how to use Gluon, the imperative interface of MXNet.
-          Learn More
+          Learn More
           Get the latest news from MXNet blogs on Medium
diff --git a/docs/tutorials/tensorrt/wavenet_optimized.png b/docs/_static/tutorials/tensorrt/wavenet_optimized.png similarity index 100% rename from docs/tutorials/tensorrt/wavenet_optimized.png rename to docs/_static/tutorials/tensorrt/wavenet_optimized.png diff --git a/docs/tutorials/tensorrt/wavenet_unoptimized.png b/docs/_static/tutorials/tensorrt/wavenet_unoptimized.png similarity index 100% rename from docs/tutorials/tensorrt/wavenet_unoptimized.png rename to docs/_static/tutorials/tensorrt/wavenet_unoptimized.png diff --git a/docs/api/python/ndarray/contrib.md b/docs/api/python/ndarray/contrib.md index f60e7f141adf..d4358ddcea22 100644 --- a/docs/api/python/ndarray/contrib.md +++ b/docs/api/python/ndarray/contrib.md @@ -75,6 +75,7 @@ In the rest of this document, we list routines provided by the `ndarray.contrib` isinf isfinite isnan + index_array index_copy getnnz edge_id diff --git a/docs/api/python/optimization/optimization.md b/docs/api/python/optimization/optimization.md index 03448123a14f..47f99f3602f8 100644 --- a/docs/api/python/optimization/optimization.md +++ b/docs/api/python/optimization/optimization.md @@ -171,8 +171,11 @@ for examples. ```eval_rst .. automodule:: mxnet.optimizer :members: + :exclude-members: NDabs + .. automodule:: mxnet.lr_scheduler :members: + .. automodule:: mxnet.initializer :members: ``` diff --git a/docs/api/python/symbol/contrib.md b/docs/api/python/symbol/contrib.md index 2a6a5efe29be..38537f7487c7 100644 --- a/docs/api/python/symbol/contrib.md +++ b/docs/api/python/symbol/contrib.md @@ -72,6 +72,7 @@ In the rest of this document, we list routines provided by the `symbol.contrib` foreach while_loop cond + index_array index_copy getnnz edge_id diff --git a/docs/api/python/symbol/linalg.md b/docs/api/python/symbol/linalg.md index 5b467b501247..436bab78c451 100644 --- a/docs/api/python/symbol/linalg.md +++ b/docs/api/python/symbol/linalg.md @@ -59,6 +59,7 @@ In the rest of this document, we list routines provided by the `symbol.linalg` p makediag extracttrian maketrian + inverse ``` ## API Reference diff --git a/docs/community/contribute.md b/docs/community/contribute.md index 50aa1a607a44..2dd92648715b 100644 --- a/docs/community/contribute.md +++ b/docs/community/contribute.md @@ -74,7 +74,6 @@ Please join either or both of the MXNet mailing lists: To join the MXNet slack channel send request to the contributor mailing list. * email - * [archive](https://the-asf.slackarchive.io/mxnet) ### Social Media diff --git a/docs/community/ecosystem.md b/docs/community/ecosystem.md index 1e2bf07335d3..e7e101372115 100644 --- a/docs/community/ecosystem.md +++ b/docs/community/ecosystem.md @@ -85,4 +85,4 @@ Community contributions to MXNet have added many new valuable features and funct ## Contributions -Do you know of a project or resource in the MXNet ecosystem that should be listed here? Or would you like to get involved by providing your own contribution? Check out the [guide for contributing to MXNet](contribute.html), and browse the [design proposals](https://cwiki.apache.org/confluence/display/MXNET/Design+Proposals) to see what others are working on. You might find something you would like to help with or use those design docs as a template for your own proposal. 
Use one of the [developer communication channels](https://mxnet.incubator.apache.org/community/contribute.html#mxnet-dev-communications) if you would like to know more, or [create a GitHub issue](https://github.com/apache/incubator-mxnet/issues/new) if you would like to propose something for the MXNet ecosystem.
+Do you know of a project or resource in the MXNet ecosystem that should be listed here? Or would you like to get involved by providing your own contribution? Check out the [guide for contributing to MXNet](contribute.html), and browse the [design proposals](https://cwiki.apache.org/confluence/display/MXNET/Proposals) to see what others are working on. You might find something you would like to help with or use those design docs as a template for your own proposal. Use one of the [developer communication channels](https://mxnet.incubator.apache.org/community/contribute.html#mxnet-dev-communications) if you would like to know more, or [create a GitHub issue](https://github.com/apache/incubator-mxnet/issues/new) if you would like to propose something for the MXNet ecosystem.
diff --git a/docs/faq/env_var.md b/docs/faq/env_var.md
index c5ebd54c55a1..cdd528cd8c8f 100644
--- a/docs/faq/env_var.md
+++ b/docs/faq/env_var.md
@@ -47,7 +47,7 @@ $env:MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0
   - The maximum number of concurrent threads that do the memory copy job on each GPU.
 * MXNET_CPU_WORKER_NTHREADS
   - Values: Int ```(default=1)```
-  - The maximum number of scheduling threads on CPU. It specifies how many operators can be run in parallel.
+  - The maximum number of scheduling threads on CPU. It specifies how many operators can be run in parallel. Note that most CPU operators are parallelized by OpenMP. To change the number of threads used by individual operators, please set `OMP_NUM_THREADS` instead.
 * MXNET_CPU_PRIORITY_NTHREADS
   - Values: Int ```(default=4)```
   - The number of threads given to prioritized CPU jobs.
@@ -146,7 +146,7 @@ $env:MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0
   - Values: 0(false) or 1(true) ```(default=0)```
   - If true, MXNet tries to use tree reduction for Push and Pull communication.
   - Otherwise, MXNet uses the default Push and Pull implementation.
-  - [Tree reduction technology](http://www.sysml.cc/doc/178.pdf) has been shown to be faster than the standard ```--kv-store device``` Push/Pull and ```--kv-store nccl``` Push/Pull for small batch sizes.
+  - Tree reduction technology has been shown to be faster than the standard ```--kv-store device``` Push/Pull and ```--kv-store nccl``` Push/Pull for small batch sizes.
 
 * MXNET_KVSTORE_LOGTREE
   - Values: 0(false) or 1(true) ```(default=0)```
@@ -199,6 +199,22 @@ When USE_PROFILER is enabled in Makefile or CMake, the following environments ca
   - If set to '0', profiler records the events of the symbolic operators.
   - If set to '1', profiler records the events of all operators.
 
+## Interface between Python and the C API
+
+* MXNET_ENABLE_CYTHON
+  - Values: 0(false), 1(true) ```(default=1)```
+  - If set to 0, MXNet uses ctypes to interface with the C API.
+  - If set to 1, MXNet tries to use the cython modules for ndarray and symbol. If that fails, ctypes is used or an error occurs, depending on MXNET_ENFORCE_CYTHON.
+
+* MXNET_ENFORCE_CYTHON
+  - Values: 0(false) or 1(true) ```(default=0)```
+  - This has an effect only if MXNET_ENABLE_CYTHON is 1.
+  - If set to 0, MXNet falls back to ctypes if importing the cython modules fails.
+  - If set to 1, MXNet raises an error if importing the cython modules fails.
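A minimal sketch of how to check from Python which of the two interfaces is actually in use; it relies on the `mx.nd._internal.NDArrayBase` naming contract spelled out in the paragraph that follows, and the snippet itself is illustrative rather than part of the patch:

```python
# Illustrative sketch (not part of this patch): check which interface MXNet loaded.
# Per this FAQ, NDArrayBase comes from mxnet._cy2/_cy3 when the cython modules
# are in use, and from mxnet._ctypes otherwise.
import mxnet as mx

backend = mx.nd._internal.NDArrayBase.__module__
if backend.startswith('mxnet._cy'):
    print('cython modules are in use:', backend)
else:
    print('ctypes is in use:', backend)
```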
+ +If cython modules are used, `mx.nd._internal.NDArrayBase` must be `mxnet._cy3.ndarray.NDArrayBase` for python 3 or `mxnet._cy2.ndarray.NDArrayBase` for python 2. +If ctypes is used, it must be `mxnet._ctypes.ndarray.NDArrayBase`. + ## Other Environment Variables * MXNET_GPU_WORKER_NSTREAMS @@ -280,6 +296,19 @@ When USE_PROFILER is enabled in Makefile or CMake, the following environments ca - Values: Int ```(default=4)``` - This variable controls how many CuDNN dropout state resources to create for each GPU context for use in operator. +* MXNET_SUBGRAPH_BACKEND + - Values: String ```(default="")``` + - This variable controls the subgraph partitioning in MXNet. + - This variable is used to perform MKL-DNN FP32 operator fusion and quantization. Please refer to the [MKL-DNN operator list](../tutorials/mkldnn/operator_list.md) for how this variable is used and the list of fusion passes. + +* MXNET_SAFE_ACCUMULATION + - Values: 0(false) or 1(true) ```(default=0)``` + - If this variable is set, the accumulation will enter the safe mode, meaning accumulation is done in a data type of higher precision than + the input data type, leading to more accurate accumulation results with a possible performance loss and backward compatibility loss. + For example, when the variable is set to 1(true), if the input data type is float16, then the accumulation will be done + with float32. + - Model accuracies do not necessarily improve with this environment variable turned on. + Settings for Minimum Memory Usage --------------------------------- - Make sure ```min(MXNET_EXEC_NUM_TEMP, MXNET_GPU_WORKER_NTHREADS) = 1``` @@ -302,5 +331,5 @@ Settings for controlling OMP tuning - Set ```MXNET_USE_NUM_CORES_OPERATOR_TUNING``` to define num_cores to be used by operator tuning code. - This reduces operator tuning overhead when there are multiple instances of mxnet running in the system and we know that - each mxnet will take only partial num_cores available with system. + each mxnet process will use only part of the num_cores available in the system. - refer: https://github.com/apache/incubator-mxnet/pull/13602 diff --git a/docs/faq/float16.md b/docs/faq/float16.md index 323218ce7df6..465668610413 100644 --- a/docs/faq/float16.md +++ b/docs/faq/float16.md @@ -17,109 +17,158 @@ # Mixed precision training using float16 -In this tutorial you will walk through how one can train deep learning neural networks with mixed precision on supported hardware. You will first see how to use float16 (both with Gluon and Symbolic APIs) and then some techniques on achieving good performance and accuracy. +In this tutorial we will walk through how one can train deep learning neural networks with mixed precision on supported hardware. We will first see how to use float16 (both with Gluon and Symbolic APIs) and then some techniques on achieving good performance and accuracy. ## Background -The computational resources required for training deep neural networks have been increasing of late because of complexity of the architectures and size of models. Mixed precision training allows us to reduces the resources required by using lower precision arithmetic. In this approach you can train using 16 bit floating points (half precision) while using 32 bit floating points (single precision) for output buffers of float16 computation. This combination of single and half precision gives rise to the name mixed precision.
It allows us to achieve the same accuracy as training with single precision, while decreasing the required memory and training or inference time. -The float16 data type is a 16 bit floating point representation according to the IEEE 754 standard. It has a dynamic range where the precision can go from 0.0000000596046 (highest, for values closest to 0) to 32 (lowest, for values in the range 32768-65536). Despite the inherent reduced precision when compared to single precision float (float32), using float16 has many advantages. The most obvious advantages are that you can reduce the size of the model by half allowing the training of larger models and using larger batch sizes. The reduced memory footprint also helps in reducing the pressure on memory bandwidth and lowering communication costs. On hardware with specialized support for float16 computation you can also greatly improve the speed of training and inference. The Volta range of Graphics Processing Units (GPUs) from Nvidia have [Tensor Cores](https://www.nvidia.com/en-us/data-center/tensorcore/) which perform efficient float16 computation. A tensor core allows accumulation of half precision products into single or half precision outputs. For the rest of this tutorial we assume that we are working with Nvidia's Tensor Cores on a Volta GPU. +The computational resources required for training deep neural networks have lately been increasing because of growing model complexity and size. Mixed precision training allows us to reduce resource utilization by using lower precision arithmetic, which is computationally cheaper and requires less memory. In this approach you can train using 16 bit floating point (half precision) while using 32 bit floating point (single precision) for output buffers of float16 computation. This allows one to achieve the same accuracy as training with single precision, while decreasing the required memory and training or inference time. + +The float16 data type is a 16 bit floating point representation according to the [IEEE 754 standard](https://ieeexplore.ieee.org/document/4610935). It has a dynamic range where the precision can go from 0.0000000596046 (highest, for values closest to 0) to 32 (lowest, for values in the range 32768-65536). Despite the inherent reduced precision when compared to single precision float (float32), using float16 has many advantages. The most obvious advantages are that you can reduce the size of the model by half allowing the training of larger models and using larger batch sizes. The reduced memory footprint also helps in reducing the pressure on memory bandwidth and lowering communication costs. On hardware with specialized support for float16 computation you can also greatly improve the speed of training and inference. The Volta range of Graphics Processing Units (GPUs) from Nvidia have [Tensor Cores](https://www.nvidia.com/en-us/data-center/tensorcore/) which perform efficient float16 computation. A tensor core allows accumulation of half precision products into single or half precision outputs. For the rest of this tutorial we assume that we are working with Nvidia's Tensor Cores on a Volta GPU. ## Prerequisites -- Volta range of Nvidia GPUs -- Cuda 9 or higher -- CUDNN v7 or higher -This tutorial also assumes that you understand how to train a network with float32. Please refer to other tutorials [here](http://mxnet.incubator.apache.org/tutorials/index.html) to get started with MXNet and/or Gluon.
This tutorial focuses on the changes needed to switch from float32 to mixed precision and tips on achieving the best performance with mixed precision. +- [Volta](https://www.nvidia.com/en-us/data-center/volta-gpu-architecture/) range of Nvidia GPUs (e.g. AWS P3 instance) +- CUDA 9 or higher +- cuDNN v7 or higher + +This tutorial also assumes understanding of how to train a network with float32 (the default). Please refer to the [logistic regression tutorial](https://mxnet.incubator.apache.org/versions/master/tutorials/gluon/logistic_regression_explained.html) to get started with Apache MXNet and the Gluon API. This tutorial focuses on the changes needed to switch from float32 to mixed precision and tips on achieving the best performance with mixed precision. ## Using the Gluon API ### Training or Inference -With Gluon, you need to take care of three things to convert a model to support float16. +With the Gluon API, you need to take care of three things to convert a model to support computation with float16. -1. Cast the Gluon Block, so as to cast the parameters of layers and change the type of input expected, to float16. This is as simple as calling the [cast](https://mxnet.incubator.apache.org/api/python/gluon/gluon.html#mxnet.gluon.Block.cast) method of the Block representing the network. -``` +1. Cast the Gluon `Block`'s parameters and expected input type to float16 by calling the [cast](https://mxnet.incubator.apache.org/api/python/gluon/gluon.html#mxnet.gluon.Block.cast) method of the `Block` representing the network. + +```python net = net.cast('float16') ``` -2. Ensure the data input to the network is of float16 type. If your DataLoader or Iterator produces output in another datatype, then you would have to cast your data. There are different ways you can do this. The easiest would be to use the [`astype`](https://mxnet.incubator.apache.org/api/python/ndarray/ndarray.html#mxnet.ndarray.NDArray.astype) method of ndarrays. -``` +2. Ensure the data input to the network is of float16 type. If your `DataLoader` or `Iterator` produces output in another datatype, then you would have to cast your data. There are different ways you can do this. The easiest would be to use the [astype](https://mxnet.incubator.apache.org/api/python/ndarray/ndarray.html#mxnet.ndarray.NDArray.astype) method of NDArrays. + +```python data = data.astype('float16', copy=False) ``` -If you are using images and DataLoader, you can also use a [Cast transform](https://mxnet.incubator.apache.org/api/python/gluon/data.html#mxnet.gluon.data.vision.transforms.Cast) +If you are using images and DataLoader, you can also use a [Cast transform](https://mxnet.incubator.apache.org/api/python/gluon/data.html#mxnet.gluon.data.vision.transforms.Cast). -3. It is preferable to use **multi_precision mode of optimizer** when training in float16. This mode of optimizer maintains a master copy of weights in float32 even when the training (i.e. forward and backward pass) is in float16. This helps increase precision of the weight updates and can lead to faster convergence for some networks. (Further discussion on this towards the end.) +3. It is preferable to use the **multi_precision** mode of the optimizer when training in float16. This mode maintains a master copy of the weights in float32 even when the training (i.e. forward and backward pass) is in float16. This helps increase precision of the weight updates and can lead to faster convergence in some scenarios.
```python optimizer = mx.optimizer.create('sgd', multi_precision=True, lr=0.01) ``` -You can play around with mixed precision using the image classification example [here](https://github.com/apache/incubator-mxnet/blob/master/example/gluon/image_classification.py). We suggest using the Caltech101 dataset option in that example and using a Resnet50_v1 network so you can quickly see the performance improvement and how the accuracy is unaffected. Here's a starter command to run this. +You can play around with mixed precision using the image classification [example](https://github.com/apache/incubator-mxnet/blob/master/example/gluon/image_classification.py). We suggest using the Caltech101 dataset option in that example and using a ResNet50V1 network so you can quickly see the performance improvement and how the accuracy is unaffected. Here's the starter command to run this example. -``` +```bash python image_classification.py --model resnet50_v1 --dataset caltech101 --gpus 0 --num-worker 30 --dtype float16 ``` - ### Fine-tuning -You can also fine-tune in float16, a model which was originally trained in float32. Here is how you would do it. As an example if you are trying to use a model pretrained on the Imagenet dataset from the ModelZoo, you would first fetch the pretrained network and then cast that network to float16. +You can also fine-tune a model, which was originally trained in float32, to use float16. Below is an example of how to fine-tune a pretrained model from the Model Zoo. You would first need to fetch the pretrained network and then cast that network to float16. -``` -pretrained_net = models.get_model(name='resnet50_v2', ctx=ctx, pretrained=True, classes=1000) +```python +import numpy as np +import mxnet as mx +from mxnet.gluon.model_zoo.vision import get_model + + +pretrained_net = get_model(name='resnet50_v2', ctx=mx.cpu(), + pretrained=True, classes=1000) pretrained_net.cast('float16') ``` -Then if you have another Resnet50_v2 model you want to fine-tune, you can just assign the features to that network and then cast it. -``` -net = models.get_model(name='resnet50_v2', ctx=ctx, pretrained=False, classes=101) -net.collect_params().initialize(mx.init.Xavier(magnitude=2.24), ctx=ctx) +Then, if you have another Resnet50V2 model you want to fine-tune, you can just assign the features to that network and then cast it. + +```python +net = get_model(name='resnet50_v2', ctx=mx.cpu(), + pretrained=False, classes=101) +net.collect_params().initialize(mx.init.Xavier(magnitude=2.24), ctx=mx.cpu()) net.features = pretrained_net.features -net.cast(dtype) +net.cast('float16') +``` + +You can check the parameters of the model by calling [summary](https://mxnet.incubator.apache.org/api/python/gluon/gluon.html#mxnet.gluon.Block.summary) with some fake data. Notice the provided `dtype=np.float16` in the line below. As mentioned earlier, we have to provide the data as float16 as well. + +```python +net.summary(mx.nd.uniform(shape=(1, 3, 224, 224), dtype=np.float16)) ``` ## Using the Symbolic API Training a network in float16 with the Symbolic API involves the following steps. + 1. Add a layer at the beginning of the network, to cast the data to float16. This will ensure that all the following layers compute in float16. 2. It is advisable to cast the output of the layers before softmax to float32, so that the softmax computation is done in float32. This is because softmax involves large reductions and it helps to keep that in float32 for more precise answer. -3.
It is advisable to use the multi-precision mode of the optimizer for more precise weight updates. This is discussed in some detail below. Here's how you would enable this mode when creating an optimizer. +3. It is advisable to use the multi-precision mode of the optimizer for more precise weight updates. Here's how you would enable this mode when creating an optimizer. ```python optimizer = mx.optimizer.create('sgd', multi_precision=True, lr=0.01) ``` -There are a few examples of building such networks which can handle float16 input in [examples/image-classification/symbols/](https://github.com/apache/incubator-mxnet/tree/master/example/image-classification/symbols). Specifically you could look at the [resnet](https://github.com/apache/incubator-mxnet/blob/master/example/image-classification/symbols/resnet.py) example. +For a full example, please refer to the [resnet.py](https://github.com/apache/incubator-mxnet/blob/master/example/image-classification/symbols/resnet.py) file on GitHub. A small, relevant excerpt from that file is presented below. -An illustration of the relevant section of the code is below. -``` +```python data = mx.sym.Variable(name="data") + if dtype == 'float16': data = mx.sym.Cast(data=data, dtype=np.float16) -// the rest of the network +# ... the rest of the network net_out = net(data) if dtype == 'float16': net_out = mx.sym.Cast(data=net_out, dtype=np.float32) + output = mx.sym.SoftmaxOutput(data=net_out, name='softmax') ``` -We have an example script which show how to train imagenet with resnet50 using float16 [here](https://github.com/apache/incubator-mxnet/tree/master/example/image-classification/train_imagenet.py) +If you would like to train a ResNet50 model on ImageNet using float16 precision, you can find the full script [here](https://github.com/apache/incubator-mxnet/tree/master/example/image-classification/train_imagenet.py) -Here's how you can use the above script to train Resnet50 v1 model with synthetic data using float16, so you can try it out even if you don't have the Imagenet dataset handy. -``` +If you don't have the ImageNet dataset at your disposal, you can still run the script above using synthetic float16 data by providing the following command: + +```bash python train_imagenet.py --network resnet-v1 --num-layers 50 --benchmark 1 --gpus 0 --batch-size 256 --dtype float16 ``` -There's a similar example for fine tuning [here](https://github.com/apache/incubator-mxnet/tree/master/example/image-classification/fine-tune.py). The following command shows how to use that script to fine tune a Resnet50 model trained on Imagenet for the Caltech 256 dataset using float16. +There's a similar example [here](https://github.com/apache/incubator-mxnet/tree/master/example/image-classification/fine-tune.py) for float16 fine-tuning of selected models: Inception v3, Inception v4, ResNetV1, ResNet50, ResNext, or VGG. The command below shows how to use that script to fine-tune a Resnet50 model trained on Imagenet for the Caltech 256 dataset using float16.
+ +```bash +python fine-tune.py --network resnet --num-layers 50 --pretrained-model imagenet1k-resnet-50 --data-train ~/.mxnet/dataset/caltech-256/caltech256-train.rec --data-val ~/data/caltech-256/caltech256-val.rec --num-examples 15420 --num-classes 256 --gpus 0 --batch-size 64 --dtype float16 ``` -python fine-tune.py --network resnet --num-layers 50 --pretrained-model imagenet1k-resnet-50 --data-train ~/data/caltech-256/caltech256-train.rec --data-val ~/data/caltech-256/caltech256-val.rec --num-examples 15420 --num-classes 256 --gpus 0 --batch-size 64 --dtype float16 + +If you don't have the `Caltech256` dataset, you can download it using the script below, and convert it into the .rec file format using the [im2rec utility](https://github.com/apache/incubator-mxnet/blob/master/tools/im2rec.py). + +```python +import os +from os.path import expanduser +import tarfile +import mxnet as mx + + +data_folder = expanduser("~/.mxnet/datasets/") +dataset_name = "256_ObjectCategories" +archive_file = "{}.tar".format(dataset_name) +archive_path = os.path.join(data_folder, archive_file) +data_url = "http://www.vision.caltech.edu/Image_Datasets/Caltech256/" + +if not os.path.isfile(archive_path): + mx.test_utils.download("{}{}".format(data_url, archive_file), + dirname=data_folder) + print('Extracting {} in {}...'.format(archive_file, data_folder)) + tar = tarfile.open(archive_path) + tar.extractall(data_folder) + tar.close() + print('Data extracted.') ``` ## Example training results -Let us consider training a Resnet50 v1 model on the Imagenet 2012 dataset. For this model, the GPU memory usage is close to the capacity of V100 GPU with a batch size of 128 when using float32. Using float16 allows the use of 256 batch size. Shared below are results using 8 V100 GPUs on a AWS p3.16x large instance. Let us compare the three scenarios that arise here: float32 with 1024 batch size, float16 with 1024 batch size and float16 with 2048 batch size. These jobs trained for 90 epochs using a learning rate of 0.4 for 1024 batch size and 0.8 for 2048 batch size. This learning rate was decayed by a factor of 0.1 at the 30th, 60th and 80th epochs. The only changes made for the float16 jobs when compared to the float32 job were that the network and data were cast to float16, and the multi-precision mode was used for optimizer. The final accuracy at 90th epoch and the time to train are tabulated below for these three scenarios. The top-1 validation errors at the end of each epoch are also plotted below. + +Let us consider training a Resnet50V1 model on the ImageNet 2012 dataset. For this model, the GPU memory usage is close to the capacity of V100 GPU with a batch size of 128 when using float32. Using float16 allows the use of 256 batch size. Shared below are results using 8 V100 GPUs on an [AWS p3.16xlarge](https://aws.amazon.com/ec2/instance-types/p3/#Amazon_EC2_P3_Instance_Product_Details) instance. + +Let us compare the three scenarios that arise here: float32 with 1024 batch size, float16 with 1024 batch size and float16 with 2048 batch size. These jobs trained for 90 epochs using a learning rate of 0.4 for 1024 batch size and 0.8 for 2048 batch size. This learning rate was decayed by a factor of 0.1 at the 30th, 60th and 80th epochs. The only changes made for the float16 jobs when compared to the float32 job were that the network and data were cast to float16, and the multi-precision mode was used for the optimizer. The final accuracy at the 90th epoch and the time to train are tabulated below for these three scenarios.
The top-1 validation errors at the end of each epoch are also plotted below. Batch size | Data type | Top 1 Validation accuracy | Time to train | Speedup | --- | --- | --- | --- | --- | @@ -127,65 +176,73 @@ Batch size | Data type | Top 1 Validation accuracy | Time to train | Speedup | 1024 | float16 | 76.34% | 7.3 hrs | 1.62x | 2048 | float16 | 76.29% | 6.5 hrs | 1.82x | -![Training curves of Resnet50 v1 on Imagenet 2012](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/tutorials/mixed-precision/resnet50v1b_imagenet_fp16_fp32_training.png) +![Training curves of Resnet50V1 on Imagenet 2012](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/tutorials/mixed-precision/resnet50v1b_imagenet_fp16_fp32_training.png) -The differences in accuracies above are within normal random variation, and there is no reason to expect float16 to have better accuracy than float32 in general. As the plot indicates training behaves similarly for these cases, even though we didn't have to change any other hyperparameters. We can also see from the table that using float16 helps train faster through faster computation with float16 as well as allowing the use of larger batch sizes. +The differences in accuracies above are within normal random variation, and there is no reason to expect float16 to have better accuracy than float32 in general. As the plot indicates, training behaves similarly for these cases, even though we didn't have to change any other hyperparameters. We can also see from the table that using float16 helps train faster through faster computation with float16 as well as allowing the use of larger batch sizes. ## Things to keep in mind ### For performance -Typical performance gains seen for float16 typically range 1.6x-2x for convolutional networks like Resnet and even about 3x for networks with LSTMs. The performance gain you see can depend on certain things which this section will introduce you to. +Performance gains seen for float16 typically range 1.6x-2x for convolutional networks like Resnet and even about 3x for networks with LSTMs. The performance gain you see can depend on certain things which this section will introduce. -1. Nvidia Tensor Cores essentially perform the computation D = A * B + C, where A and B are half precision matrices, while C and D could be either half precision or full precision. The tensor cores are most efficient when dimensions of these matrices are multiples of 8. This means that Tensor Cores can not be used in all cases for fast float16 computation. When training models like Resnet50 on the Cifar10 dataset, the tensors involved are sometimes smaller, and Tensor Cores can not always be used. The computation in that case falls back to slower algorithms and using float16 turns out to be slower than float32 on a single GPU. Note that when using multiple GPUs, using float16 can still be faster than float32 because of reduction in communication costs. +1. Nvidia Tensor Cores essentially perform the computation `D = A * B + C`, where A and B are half precision matrices, while C and D could be either half precision or full precision. The tensor cores are most efficient when dimensions of these matrices are multiples of 8. This means that Tensor Cores can not be used in all cases for fast float16 computation. When training models like Resnet50 on the Cifar10 dataset, the tensors involved are sometimes smaller, and Tensor Cores can not always be used.
The computation in that case falls back to slower algorithms and using float16 turns out to be slower than float32 on a single GPU. Note that when using multiple GPUs, using float16 can still be faster than float32 because of reduction in communication costs. 2. When you scale up the batch size ensure that IO and data pre-processing is not your bottleneck. If you see a slowdown this would be the first thing to check. 3. It is advisable to use batch sizes that are multiples of 8 because of the above reason when training with float16. As always, batch sizes which are powers of 2 would be best when compared to those around it. -4. You can check whether your program is using Tensor cores for fast float16 computation by profiling with `nvprof`. -The operations with `s884cudnn` in their names represent the use of Tensor cores. +4. You can check whether your program is using Tensor cores for fast float16 computation by profiling with `nvprof`. The operations with `s884cudnn` in their names represent the use of Tensor cores. -5. When not limited by GPU memory, it can help to set the environment variable MXNET_CUDNN_AUTOTUNE_DEFAULT to 2. This configures MXNet to run tuning tests and choose the fastest convolution algorithm whose memory requirements may exceed the default memory of CUDA workspace. +5. When not limited by GPU memory, it can help to set the environment variable `MXNET_CUDNN_AUTOTUNE_DEFAULT` to `2`. This configures MXNet to run tuning tests and choose the fastest convolution algorithm whose memory requirements may exceed the default memory of CUDA workspace. 6. Please note that float16 on CPU might not be supported for all operators, as in most cases float16 on CPU is much slower than float32. - ### For accuracy #### Multi precision mode + When training in float16, it is advisable to still store the master copy of the weights in float32 for better accuracy. The higher precision of float32 helps overcome cases where gradient update can become 0 if represented in float16. This mode can be activated by setting the parameter `multi_precision` of optimizer params to `True` as in the above example. It has been found that this is not required for all networks to achieve the same accuracy as with float32, but nevertheless recommended. Note that for distributed training, this is currently slightly slower than without `multi_precision`, but still much faster than using float32 for training. -#### Large reductions -Since float16 has low precision for large numbers, it is best to leave layers which perform large reductions in float32. This includes BatchNorm and Softmax. Ensuring that Batchnorm performs reduction in float32 is handled by default in both Gluon and Module APIs. While Softmax is set to use float32 even during float16 training in Gluon, in the Module API there needs to be a cast to float32 before softmax as the above symbolic example code shows. +#### Large reductions + +Since float16 has low precision for large numbers, it is best to leave layers which perform large reductions in float32. This includes BatchNorm and Softmax. Ensuring that Batchnorm performs reduction in float32 is handled by default in both Gluon and Module APIs. While Softmax is set to use float32 even during float16 training in Gluon, in the Module API there needs to be a cast to float32 before softmax as the above symbolic example code shows. #### Loss scaling -For some networks just switching the training to float16 mode was not found to be enough to reach the same accuracy as when training with float32.
This is because the activation gradients computed are too small and could not be represented in float16 representable range. Such networks can be made to achieve the accuracy reached by float32 with a couple of changes. + +For some networks just switching the training to float16 mode was not found to be enough to reach the same accuracy as when training with float32. This is because the activation gradients computed are too small and could not be represented in the float16 representable range. Such networks can be made to achieve the accuracy reached by float32 with a couple of changes. Most of the float16 representable range is not used by activation gradients generally. So you can shift the gradients into float16 range by scaling up the loss by a factor `S`. By the chain rule, this scales up the loss before backward pass, and then you can scale back the gradients before updating the weights. This ensures that training in float16 can use the same hyperparameters as used during float32 training. Here's how you can configure the loss to be scaled up by 128 and rescale the gradient down before updating the weights. -*Gluon* -``` +*Gluon API* + +```python loss = gluon.loss.SoftmaxCrossEntropyLoss(weight=128) -optimizer = mx.optimizer.create('sgd', multi_precision=True, rescale_grad=1.0/128) -``` -*Module* ``` + +optimizer = mx.optimizer.create('sgd', + multi_precision=True, + rescale_grad=1.0/128) ``` + +*Module API* + +```python mxnet.sym.SoftmaxOutput(other_args, grad_scale=128.0) -optimizer = mx.optimizer.create('sgd', multi_precision=True, rescale_grad=1.0/128) +optimizer = mx.optimizer.create('sgd', + multi_precision=True, + rescale_grad=1.0/128) ``` Networks like Multibox SSD, R-CNN, bigLSTM and Seq2seq were found to exhibit such behavior. -You can choose a constant scaling factor while ensuring that the absolute value of gradient when multiplied by this factor remains in the range of float16. Generally powers of 2 like 64,128,256,512 are chosen. Refer the linked articles below for more details on this. - -## Video Tutorial - -We also have a video tutorial for using Mixed Precision with MXNet. You can check that out [here](https://www.youtube.com/watch?v=pR4KMh1lGC0) +You can choose a constant scaling factor while ensuring that the absolute value of the gradient, when multiplied by this factor, remains in the range of float16. Generally powers of 2 like 64, 128, 256, 512 are chosen. Refer to the linked articles below for more details on this. ## References + 1. [Training with Mixed Precision User Guide](http://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html) 2. [Mixed Precision Training at ICLR 2018](https://arxiv.org/pdf/1710.03740.pdf) 3. [Mixed-Precision Training of Deep Neural Networks](https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/) +## Recommended Next Steps + +* Check out our video tutorial on [Using Mixed Precision with MXNet](https://www.youtube.com/watch?v=pR4KMh1lGC0) \ No newline at end of file diff --git a/docs/faq/index.md b/docs/faq/index.md index 92f1ccde00f4..6c398d46471a 100644 --- a/docs/faq/index.md +++ b/docs/faq/index.md @@ -100,4 +100,4 @@ If you need help with using MXNet, have questions about applying it to a particu We track bugs and new feature requests in the MXNet Github repo in the issues folder: [mxnet/issues](https://github.com/apache/incubator-mxnet/issues). ## Roadmap -MXNet is evolving fast.
To see what's next and what we are working on internally, go to the [MXNet Roadmap](https://cwiki.apache.org/confluence/display/MXNET/MXNet+Roadmap). +MXNet is evolving fast. To see what's next and what we are working on internally, go to the [MXNet Roadmap](https://cwiki.apache.org/confluence/display/MXNET/Roadmap). diff --git a/docs/faq/new_op.md b/docs/faq/new_op.md index 4c10708b944d..2395379bafc1 100644 --- a/docs/faq/new_op.md +++ b/docs/faq/new_op.md @@ -292,6 +292,28 @@ output or nothing to calculating gradient. For more complicated patterns, use `MakeGradNode(op_name, n, heads, dict)` to create gradient entries, where heads are input entries to the backward op, composed from ograds and n->inputs. +When assembling a return vector of `std::vector<nnvm::NodeEntry> ret;`, a common pattern is to +either create nodes in place, as in: + +``` +ret.emplace_back(MakeNode("zeros_like", n->attrs.name + "_xyz_backward", + {n->inputs[1]}, nullptr, &n)); +``` + +Or create the node, modify it, and then move it into NodeEntry's constructor if the node is not to be used +again. This avoids unnecessary copies of the shared_ptr. + +``` +for (size_t i = 0; i < n->inputs.size(); ++i) { + nnvm::NodePtr node = nnvm::Node::Create(); + node->attrs.op = copy_op; + node->inputs = {ograds[0]}; + ret.emplace_back(std::move(node)); +} +``` + +The first case uses RVO and the second uses in-place construction. + #### FCompute\<xpu\> Simple operators can register FCompute\<xpu\> with `.set_attr<FCompute>("FCompute<cpu>", ...)` and `.set_attr<FCompute>("FCompute<gpu>", ...)` for both CPU and (optionally) GPU computation. diff --git a/docs/faq/perf.md b/docs/faq/perf.md index e1318b843a03..62b40247081c 100644 --- a/docs/faq/perf.md +++ b/docs/faq/perf.md @@ -34,8 +34,13 @@ Performance is mainly affected by the following 4 factors: ## Intel CPU -For using Intel Xeon CPUs for training and inference, we suggest enabling -`USE_MKLDNN = 1` in `config.mk`. +When using Intel Xeon CPUs for training and inference, the `mxnet-mkl` package is recommended. Adding `--pre` installs a nightly build from master. Without it you will install the latest patched release of MXNet: + +``` +$ pip install mxnet-mkl [--pre] +``` + +Or build MXNet from source code with `USE_MKLDNN=1`. For Linux users, `USE_MKLDNN=1` will be turned on by default. We also find that setting the following environment variables can help: diff --git a/docs/install/build_from_source.md b/docs/install/build_from_source.md index 7b00b03abefe..dacac09c3d11 100644 --- a/docs/install/build_from_source.md +++ b/docs/install/build_from_source.md @@ -42,14 +42,14 @@ Building from source follows this general two-step flow of building the shared l * [non-Intel CPUs](#recommended-for-Systems-with-non-Intel-CPUs) 2. [Install the language API binding(s)](#installing-mxnet-language-bindings) you would like to use for MXNet. MXNet's newest and most popular API is Gluon. Gluon is built into the Python binding. If Python isn't your preference, you still have more options.
MXNet supports several other language APIs: - - [Python (includes Gluon)](../api/python/index.html) - - [C++](../api/c++/index.html) - - [Clojure](../api/clojure/index.html) - - [Java](../api/java/index.html) - - [Julia](../api/julia/index.html) - - [Perl](../api/perl/index.html) - - [R](../api/r/index.html) - - [Scala](../api/scala/index.html) + - [Python (includes Gluon)](../api/python/index.md) + - [C++](../api/c++/index.md) + - [Clojure](../api/clojure/index.md) + - [Java](../api/java/index.md) + - [Julia](../api/julia/index.md) + - [Perl](../api/perl/index.md) + - [R](../api/r/index.md) + - [Scala](../api/scala/index.md)
@@ -58,12 +58,11 @@ MXNet's newest and most popular API is Gluon. Gluon is built into the Python bin Detailed instructions are provided per operating system. Each of these guides also covers how to install the specific [Language Bindings](#installing-mxnet-language-bindings) you require. You may jump to those, but it is recommended that you continue reading to understand more general "build from source" options. -* [Amazon Linux / CentOS / RHEL](centos_setup.html) -* [macOS](osx_setup.html) -* [Raspbian](raspian_setup.html) -* [TX2](tx2_setup.html) -* [Ubuntu](ubuntu_setup.html) -* [Windows](windows_setup.html) +* [Amazon Linux / CentOS / RHEL](centos_setup.md) +* [macOS](osx_setup.md) +* [Devices](https://mxnet.incubator.apache.org/versions/master/install/index.html?platform=Devices&language=Python&processor=CPU) +* [Ubuntu](ubuntu_setup.md) +* [Windows](windows_setup.md)
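Once the shared library and the Python binding have been built and installed following one of the guides above, a short smoke test confirms that the library loads and can run an operator end to end; a minimal sketch, assuming the Python binding is importable:

```python
import mxnet as mx

# A tiny end-to-end check: build an array, run a couple of operators,
# and copy the result back to NumPy.
a = mx.nd.ones((2, 3))
b = (a * 2 + 1).asnumpy()

print(mx.__version__)
print(b)  # expect a 2x3 array filled with 3.0
```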
@@ -231,7 +230,7 @@ For example, you can specify using all cores on Linux as follows: ```bash mkdir build && cd build -cmake -GNinja . +cmake -GNinja .. ninja -v ``` @@ -241,7 +240,7 @@ ninja -v ```bash mkdir build && cd build -cmake -DUSE_CUDA=1 -DUSE_CUDA_PATH=/usr/local/cuda -DUSE_CUDNN=1 -DUSE_MKLDNN=1 -GNinja . +cmake -DUSE_CUDA=1 -DUSE_CUDA_PATH=/usr/local/cuda -DUSE_CUDNN=1 -DUSE_MKLDNN=1 -GNinja .. ninja -v ``` @@ -250,7 +249,7 @@ ninja -v ```bash mkdir build && cd build -cmake -DBLAS=open -DUSE_CUDA=1 -DUSE_CUDA_PATH=/usr/local/cuda -DUSE_CUDNN=1 -GNinja . +cmake -DBLAS=open -DUSE_CUDA=1 -DUSE_CUDA_PATH=/usr/local/cuda -DUSE_CUDNN=1 -GNinja .. ninja -v ``` @@ -259,7 +258,7 @@ ninja -v ```bash mkdir build && cd build -cmake -DUSE_CUDA=0 -DUSE_MKLDNN=1 -GNinja . +cmake -DUSE_CUDA=0 -DUSE_MKLDNN=1 -GNinja .. ninja -v ``` @@ -268,7 +267,7 @@ ninja -v ```bash mkdir build && cd build -cmake -DUSE_CUDA=0 -DBLAS=open -GNinja . +cmake -DUSE_CUDA=0 -DBLAS=open -GNinja .. ninja -v ``` @@ -278,7 +277,7 @@ ninja -v ```bash mkdir build && cd build -cmake -DUSE_OPENCV=0 -GNinja . +cmake -DUSE_OPENCV=0 -GNinja .. ninja -v ``` @@ -286,7 +285,7 @@ ninja -v ```bash mkdir build && cd build -cmake -DBLAS=apple -DUSE_OPENCV=0 -DUSE_OPENMP=0 -GNinja . +cmake -DBLAS=apple -DUSE_OPENCV=0 -DUSE_OPENMP=0 -GNinja .. ninja -v ``` @@ -295,7 +294,7 @@ ninja -v ```bash brew install llvm mkdir build && cd build -cmake -DBLAS=apple -DUSE_OPENMP=1 -GNinja . +cmake -DBLAS=apple -DUSE_OPENMP=1 -GNinja .. ninja -v ``` diff --git a/docs/install/c_plus_plus.md b/docs/install/c_plus_plus.md index ee21014bc5f1..13c1a87cbd5f 100644 --- a/docs/install/c_plus_plus.md +++ b/docs/install/c_plus_plus.md @@ -18,12 +18,12 @@ ## Build the C++ package The C++ package has the same prerequisites as the MXNet library. -To enable C++ package, just add `USE_CPP_PACKAGE=1` in the [build from source](build_from_source.html) options when building the MXNet shared library. +To enable C++ package, just add `USE_CPP_PACKAGE=1` in the [build from source](build_from_source.md) options when building the MXNet shared library. For example to build MXNet with GPU support and the C++ package, OpenCV, and OpenBLAS, from the project root you would run: ```bash -cmake -DUSE_CUDA=1 -DUSE_CUDA_PATH=/usr/local/cuda -DUSE_CUDNN=1 -DUSE_MKLDNN=1 -DUSE_CPP_PACKAGE=1 -GNinja . +cmake -DUSE_CUDA=1 -DUSE_CUDA_PATH=/usr/local/cuda -DUSE_CUDNN=1 -DUSE_MKLDNN=1 -DUSE_CPP_PACKAGE=1 -GNinja .. ninja -v ``` @@ -40,7 +40,7 @@ You can find C++ code examples in the `cpp-package/example` folder of the MXNet ## Tutorials -* [MXNet C++ API Basics](https://mxnet.incubator.apache.org/tutorials/c++/basics.html) +* [MXNet C++ API Basics](../tutorials/c++/basics.md) ## Related Topics diff --git a/docs/install/download.md b/docs/install/download.md index cf95c2344f14..808b4b8a72e5 100644 --- a/docs/install/download.md +++ b/docs/install/download.md @@ -21,6 +21,7 @@ These source archives are generated from tagged releases. 
Updates and patches wi | Version | Source | PGP | SHA | |---------|-------------------------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------| +| 1.4.1 | [Download](https://www.apache.org/dyn/closer.cgi/incubator/mxnet/1.4.1/apache-mxnet-src-1.4.1-incubating.tar.gz) | [Download](https://apache.org/dist/incubator/mxnet/1.4.1/apache-mxnet-src-1.4.1-incubating.tar.gz.asc) | [Download](https://apache.org/dist/incubator/mxnet/1.4.1/apache-mxnet-src-1.4.1-incubating.tar.gz.sha512) | | 1.4.0 | [Download](https://www.apache.org/dyn/closer.cgi/incubator/mxnet/1.4.0/apache-mxnet-src-1.4.0-incubating.tar.gz) | [Download](https://apache.org/dist/incubator/mxnet/1.4.0/apache-mxnet-src-1.4.0-incubating.tar.gz.asc) | [Download](https://apache.org/dist/incubator/mxnet/1.4.0/apache-mxnet-src-1.4.0-incubating.tar.gz.sha512) | | 1.3.1 | [Download](https://www.apache.org/dyn/closer.cgi/incubator/mxnet/1.3.1/apache-mxnet-src-1.3.1-incubating.tar.gz) | [Download](https://apache.org/dist/incubator/mxnet/1.3.1/apache-mxnet-src-1.3.1-incubating.tar.gz.asc) | [Download](https://apache.org/dist/incubator/mxnet/1.3.1/apache-mxnet-src-1.3.1-incubating.tar.gz.sha512) | | 1.3.0 | [Download](https://archive.apache.org/dist/incubator/mxnet/1.3.0/apache-mxnet-src-1.3.0-incubating.tar.gz) | [Download](https://archive.apache.org/dist/incubator/mxnet/1.3.0/apache-mxnet-src-1.3.0-incubating.tar.gz.asc) | [Download](https://archive.apache.org/dist/incubator/mxnet/1.3.0/apache-mxnet-src-1.3.0-incubating.tar.gz.sha512) | diff --git a/docs/install/index.md b/docs/install/index.md index 10db8d95b44a..5fef5ca47e57 100644 --- a/docs/install/index.md +++ b/docs/install/index.md @@ -28,9 +28,7 @@ download.md java_setup.md osx_setup.md - raspbian_setup.md scala_setup.md - tx2_setup.md ubuntu_setup.md validate_mxnet.md windows_setup.md @@ -39,10 +37,10 @@ Indicate your preferred configuration. Then, follow the customized commands to install MXNet.
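As a closing note on the release archives listed above: every row of the download table publishes a SHA-512 checksum file next to the source tarball. Below is a minimal sketch of checking a downloaded archive against it; the file names are taken from the 1.4.1 row, and the exact layout of the published .sha512 file is an assumption here, so the comparison is kept deliberately loose:

```python
import hashlib

# Assumes the tarball and its .sha512 companion (from the 1.4.1 row of
# the table above) were already downloaded into the working directory.
archive = "apache-mxnet-src-1.4.1-incubating.tar.gz"

digest = hashlib.sha512()
with open(archive, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        digest.update(chunk)

# Checksum files come in several layouts ("<hash>  <name>", gpg --print-md
# output, ...), so normalize case and whitespace before comparing.
published = "".join(open(archive + ".sha512").read().lower().split())
print("OK" if digest.hexdigest() in published else "MISMATCH")
```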