diff --git a/.travis.yml b/.travis.yml
index b0aa26c1a3a1..485faadee277 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -34,6 +34,7 @@ script:
   # Temporarily disable travis build due to travis constantly time out, tracked in
   # https://github.com/apache/incubator-mxnet/issues/16535:
   - export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0
+  - export MXNET_SUBGRAPH_VERBOSE=0
   - mv make/osx.mk config.mk
   # - make -j 2
diff --git a/SECURITY.md b/SECURITY.md
new file mode 100644
index 000000000000..bbb4505499c1
--- /dev/null
+++ b/SECURITY.md
@@ -0,0 +1,25 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# Security Policy
+
+## Reporting a Vulnerability
+The Apache Software Foundation takes a very active stance in eliminating security problems and denial of service attacks against its products.
+
+We strongly encourage folks to report such problems to our private security mailing list first, before disclosing them in a public forum.
+
+For instructions on how to report a security vulnerability, please consult our [security guide](https://mxnet.apache.org/api/faq/security).
diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh
index 0112d6bb1704..c2acc0f40d7d 100755
--- a/ci/docker/runtime_functions.sh
+++ b/ci/docker/runtime_functions.sh
@@ -1007,6 +1007,7 @@ cd_unittest_ubuntu() {
     export PYTHONPATH=./python/
     export MXNET_MKLDNN_DEBUG=1 # Ignored if not present
     export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0
+    export MXNET_SUBGRAPH_VERBOSE=0
     export MXNET_ENABLE_CYTHON=0
     export CD_JOB=1 # signal this is a CD run so any unnecessary tests can be skipped
@@ -1049,6 +1050,7 @@ unittest_ubuntu_python2_cpu_cython() {
     export PYTHONPATH=./python/
     export MXNET_MKLDNN_DEBUG=1
     export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0
+    export MXNET_SUBGRAPH_VERBOSE=0
     export MXNET_ENABLE_CYTHON=1
     export MXNET_ENFORCE_CYTHON=1
     check_cython 2
@@ -1062,6 +1064,7 @@ unittest_ubuntu_python2_cpu() {
     export PYTHONPATH=./python/
     export MXNET_MKLDNN_DEBUG=0
     export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0
+    export MXNET_SUBGRAPH_VERBOSE=0
     export MXNET_ENABLE_CYTHON=0
     nosetests-2.7 $NOSE_COVERAGE_ARGUMENTS $NOSE_TIMER_ARGUMENTS --with-xunit --xunit-file nosetests_unittest.xml --verbose tests/python/unittest
     nosetests-2.7 $NOSE_COVERAGE_ARGUMENTS $NOSE_TIMER_ARGUMENTS --with-xunit --xunit-file nosetests_train.xml --verbose tests/python/train
@@ -1073,6 +1076,7 @@ unittest_ubuntu_python3_cpu() {
     export PYTHONPATH=./python/
     export MXNET_MKLDNN_DEBUG=0 # Ignored if not present
     export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0
+    export MXNET_SUBGRAPH_VERBOSE=0
     export MXNET_ENABLE_CYTHON=0
     nosetests-3.4 $NOSE_COVERAGE_ARGUMENTS $NOSE_TIMER_ARGUMENTS --with-xunit --xunit-file nosetests_unittest.xml --verbose tests/python/unittest
     nosetests-3.4 $NOSE_COVERAGE_ARGUMENTS $NOSE_TIMER_ARGUMENTS --with-xunit --xunit-file nosetests_quantization.xml --verbose tests/python/quantization
@@ -1083,6 +1087,7 @@ unittest_ubuntu_python3_cpu_mkldnn() {
     export PYTHONPATH=./python/
     export MXNET_MKLDNN_DEBUG=0 # Ignored if not present
     export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0
+    export MXNET_SUBGRAPH_VERBOSE=0
     export MXNET_ENABLE_CYTHON=0
     nosetests-3.4 $NOSE_COVERAGE_ARGUMENTS $NOSE_TIMER_ARGUMENTS --with-xunit --xunit-file nosetests_unittest.xml --verbose tests/python/unittest
     nosetests-3.4 $NOSE_COVERAGE_ARGUMENTS $NOSE_TIMER_ARGUMENTS --with-xunit --xunit-file nosetests_mkl.xml --verbose tests/python/mkl
@@ -1093,6 +1098,7 @@ unittest_ubuntu_python2_gpu() {
     export PYTHONPATH=./python/
     export MXNET_MKLDNN_DEBUG=0 # Ignored if not present
     export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0
+    export MXNET_SUBGRAPH_VERBOSE=0
     export CUDNN_VERSION=${CUDNN_VERSION:-7.0.3}
     nosetests-2.7 $NOSE_COVERAGE_ARGUMENTS $NOSE_TIMER_ARGUMENTS --with-xunit --xunit-file nosetests_gpu.xml --verbose tests/python/gpu
 }
@@ -1102,6 +1108,7 @@ unittest_ubuntu_python3_gpu() {
     export PYTHONPATH=./python/
     export MXNET_MKLDNN_DEBUG=0 # Ignored if not present
     export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0
+    export MXNET_SUBGRAPH_VERBOSE=0
     export CUDNN_VERSION=${CUDNN_VERSION:-7.0.3}
     export MXNET_ENABLE_CYTHON=0
     nosetests-3.4 $NOSE_COVERAGE_ARGUMENTS $NOSE_TIMER_ARGUMENTS --with-xunit --xunit-file nosetests_gpu.xml --verbose tests/python/gpu
@@ -1112,6 +1119,7 @@ unittest_ubuntu_python3_gpu_cython() {
     export PYTHONPATH=./python/
     export MXNET_MKLDNN_DEBUG=1 # Ignored if not present
     export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0
+    export MXNET_SUBGRAPH_VERBOSE=0
     export CUDNN_VERSION=${CUDNN_VERSION:-7.0.3}
     export MXNET_ENABLE_CYTHON=1
     export MXNET_ENFORCE_CYTHON=1
@@ -1123,6 +1131,7 @@ unittest_ubuntu_python3_gpu_nocudnn() {
     set -ex
     export PYTHONPATH=./python/
     export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0
+    export MXNET_SUBGRAPH_VERBOSE=0
     export CUDNN_OFF_TEST_ONLY=true
     export MXNET_ENABLE_CYTHON=0
     nosetests-3.4 $NOSE_COVERAGE_ARGUMENTS $NOSE_TIMER_ARGUMENTS --with-xunit --xunit-file nosetests_gpu.xml --verbose tests/python/gpu
@@ -1132,6 +1141,7 @@ unittest_ubuntu_tensorrt_gpu() {
     set -ex
     export PYTHONPATH=./python/
     export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0
+    export MXNET_SUBGRAPH_VERBOSE=0
     export LD_LIBRARY_PATH=/work/mxnet/lib:$LD_LIBRARY_PATH
     export CUDNN_VERSION=${CUDNN_VERSION:-7.0.3}
     export MXNET_ENABLE_CYTHON=0
@@ -1146,6 +1156,7 @@ unittest_ubuntu_python2_quantization_gpu() {
     export PYTHONPATH=./python/
     export MXNET_MKLDNN_DEBUG=0 # Ignored if not present
     export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0
+    export MXNET_SUBGRAPH_VERBOSE=0
     export CUDNN_VERSION=${CUDNN_VERSION:-7.0.3}
     export MXNET_ENABLE_CYTHON=0
     nosetests-2.7 $NOSE_COVERAGE_ARGUMENTS $NOSE_TIMER_ARGUMENTS --with-xunit --xunit-file nosetests_quantization_gpu.xml --verbose tests/python/quantization_gpu
@@ -1158,6 +1169,7 @@ unittest_ubuntu_python3_quantization_gpu() {
     export PYTHONPATH=./python/
     export MXNET_MKLDNN_DEBUG=0 # Ignored if not present
     export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0
+    export MXNET_SUBGRAPH_VERBOSE=0
     export CUDNN_VERSION=${CUDNN_VERSION:-7.0.3}
     export MXNET_ENABLE_CYTHON=0
     nosetests-3.4 $NOSE_COVERAGE_ARGUMENTS $NOSE_TIMER_ARGUMENTS --with-xunit --xunit-file nosetests_quantization_gpu.xml --verbose tests/python/quantization_gpu
@@ -1320,6 +1332,7 @@ integrationtest_ubuntu_gpu_python() {
     set -ex
     export PYTHONPATH=./python/
     export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0
+    export MXNET_SUBGRAPH_VERBOSE=0
     python example/image-classification/test_score.py
 }
@@ -1348,6 +1361,7 @@ integrationtest_ubuntu_cpu_dist_kvstore() {
     pushd .
     export PYTHONPATH=./python/
     export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0
+    export MXNET_SUBGRAPH_VERBOSE=0
     export MXNET_USE_OPERATOR_TUNING=0
     cd tests/nightly/
     ../../tools/launch.py -n 7 --launcher local python dist_sync_kvstore.py --type=gluon_step_cpu
@@ -1382,6 +1396,7 @@ integrationtest_ubuntu_gpu_dist_kvstore() {
     pushd .
     export PYTHONPATH=./python/
     export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0
+    export MXNET_SUBGRAPH_VERBOSE=0
     cd tests/nightly/
     ../../tools/launch.py -n 4 --launcher local python dist_device_sync_kvstore.py
     ../../tools/launch.py -n 4 --launcher local python dist_sync_kvstore.py --type=init_gpu
@@ -1569,6 +1584,7 @@ nightly_tutorial_test_ubuntu_python3_gpu() {
     export MXNET_DOCS_BUILD_MXNET=0
     make html
     export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0
+    export MXNET_SUBGRAPH_VERBOSE=0
     export PYTHONPATH=/work/mxnet/python/
     export MXNET_TUTORIAL_TEST_KERNEL=python3
     cd /work/mxnet/tests/tutorials
@@ -1582,6 +1598,7 @@ nightly_tutorial_test_ubuntu_python2_gpu() {
     export MXNET_DOCS_BUILD_MXNET=0
     make html
     export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0
+    export MXNET_SUBGRAPH_VERBOSE=0
     export PYTHONPATH=/work/mxnet/python/
     export MXNET_TUTORIAL_TEST_KERNEL=python2
     cd /work/mxnet/tests/tutorials
@@ -1975,7 +1992,7 @@ cd_package_pypi() {
     popd
 }

-# Sanity checks wheel file 
+# Sanity checks wheel file
 cd_integration_test_pypi() {
     set -ex
     local python_cmd=${1:?"This function requires a python command as the first argument"}
diff --git a/ci/windows/test_py2_cpu.ps1 b/ci/windows/test_py2_cpu.ps1
index df9b15ba1ec3..c39d1fa45328 100644
--- a/ci/windows/test_py2_cpu.ps1
+++ b/ci/windows/test_py2_cpu.ps1
@@ -20,6 +20,7 @@
 $env:MXNET_LIBRARY_PATH=join-path $pwd.Path windows_package\lib\libmxnet.dll
 $env:PYTHONPATH=join-path $pwd.Path windows_package\python
 $env:MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0
+$env:MXNET_SUBGRAPH_VERBOSE=0
 $env:MXNET_HOME=[io.path]::combine($PSScriptRoot, 'mxnet_home')
 C:\Python27\Scripts\pip install -r tests\requirements.txt
diff --git a/ci/windows/test_py2_gpu.ps1 b/ci/windows/test_py2_gpu.ps1
index f2974ff6f7b6..b2ea62fc7cd4 100644
--- a/ci/windows/test_py2_gpu.ps1
+++ b/ci/windows/test_py2_gpu.ps1
@@ -20,6 +20,7 @@
 $env:MXNET_LIBRARY_PATH=join-path $pwd.Path windows_package\lib\libmxnet.dll
 $env:PYTHONPATH=join-path $pwd.Path windows_package\python
 $env:MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0
+$env:MXNET_SUBGRAPH_VERBOSE=0
 $env:MXNET_HOME=[io.path]::combine($PSScriptRoot, 'mxnet_home')
 C:\Python27\Scripts\pip install -r tests\requirements.txt
diff --git a/ci/windows/test_py3_cpu.ps1 b/ci/windows/test_py3_cpu.ps1
index 900bfd161cd0..1e09b5c98ce1 100644
--- a/ci/windows/test_py3_cpu.ps1
+++ b/ci/windows/test_py3_cpu.ps1
@@ -20,6 +20,7 @@
 $env:MXNET_LIBRARY_PATH=join-path $pwd.Path windows_package\lib\libmxnet.dll
 $env:PYTHONPATH=join-path $pwd.Path windows_package\python
 $env:MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0
+$env:MXNET_SUBGRAPH_VERBOSE=0
 $env:MXNET_HOME=[io.path]::combine($PSScriptRoot, 'mxnet_home')
 C:\Python37\Scripts\pip install -r tests\requirements.txt
diff --git a/ci/windows/test_py3_gpu.ps1 b/ci/windows/test_py3_gpu.ps1
index b6e951b291fb..9bf7d04d8a88 100644
--- a/ci/windows/test_py3_gpu.ps1
+++ b/ci/windows/test_py3_gpu.ps1
@@ -20,6 +20,7 @@
 $env:MXNET_LIBRARY_PATH=join-path $pwd.Path windows_package\lib\libmxnet.dll
 $env:PYTHONPATH=join-path $pwd.Path windows_package\python
 $env:MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0
+$env:MXNET_SUBGRAPH_VERBOSE=0
 $env:MXNET_HOME=[io.path]::combine($PSScriptRoot, 'mxnet_home')
 C:\Python37\Scripts\pip install -r tests\requirements.txt
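Every CI entry point above now exports the new `MXNET_SUBGRAPH_VERBOSE` switch next to the existing storage-fallback one. As a quick illustration (not part of the patch), a local test run can mirror that environment; the only subtle point is to set the variables before `mxnet` is imported so the backend sees them:

```python
# Sketch only: reproduce the CI logging environment locally.
import os

os.environ["MXNET_STORAGE_FALLBACK_LOG_VERBOSE"] = "0"  # silence storage-fallback warnings
os.environ["MXNET_SUBGRAPH_VERBOSE"] = "0"              # silence subgraph-partitioning logs

import mxnet as mx  # import after setting the variables so the library picks them up
```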
diff --git a/cpp-package/README.md b/cpp-package/README.md
index 05fb506db42b..77ff0ee36e80 100644
--- a/cpp-package/README.md
+++ b/cpp-package/README.md
@@ -55,7 +55,7 @@ In order to consume the C++ API please follow the steps below.

 ## Tutorial

-A basic tutorial can be found at .
+A basic tutorial can be found at .

 ## Examples
diff --git a/docs/python_docs/python/api/gluon/data/index.rst b/docs/python_docs/python/api/gluon/data/index.rst
new file mode 100644
index 000000000000..f9e8a21e69d2
--- /dev/null
+++ b/docs/python_docs/python/api/gluon/data/index.rst
@@ -0,0 +1,63 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+   or more contributor license agreements.  See the NOTICE file
+   distributed with this work for additional information
+   regarding copyright ownership.  The ASF licenses this file
+   to you under the Apache License, Version 2.0 (the
+   "License"); you may not use this file except in compliance
+   with the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing,
+   software distributed under the License is distributed on an
+   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+   KIND, either express or implied.  See the License for the
+   specific language governing permissions and limitations
+   under the License.
+
+gluon.data
+==========
+
+.. automodule:: mxnet.gluon.data
+
+Datasets
+--------
+
+.. autosummary::
+
+    Dataset
+    ArrayDataset
+    RecordFileDataset
+    SimpleDataset
+
+Sampling
+--------
+
+.. autosummary::
+
+    Sampler
+    SequentialSampler
+    RandomSampler
+    BatchSampler
+
+DataLoader
+----------
+
+.. autosummary::
+
+    DataLoader
+
+
+API Reference
+-------------
+.. automodule:: mxnet.gluon.data
+    :members:
+    :imported-members:
+    :autosummary:
+
+.. toctree::
+   :hidden:
+   :maxdepth: 2
+   :glob:
+
+   */index
\ No newline at end of file
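Since this new page only lists class names, a minimal usage sketch of the `Dataset`/`DataLoader` flow may help orient readers; it is not part of the patch, and the shapes and batch size are illustrative:

```python
import mxnet as mx
from mxnet.gluon.data import ArrayDataset, DataLoader

# pair features with labels, then draw shuffled mini-batches
features = mx.nd.random.uniform(shape=(10, 3))
labels = mx.nd.arange(10)
dataset = ArrayDataset(features, labels)
loader = DataLoader(dataset, batch_size=4, shuffle=True)

for X, y in loader:
    print(X.shape, y.shape)  # (4, 3) (4,) on full batches, (2, 3) (2,) on the last
```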
diff --git a/docs/python_docs/python/api/gluon/data/vision/datasets/index.rst b/docs/python_docs/python/api/gluon/data/vision/datasets/index.rst
new file mode 100644
index 000000000000..6b007526607a
--- /dev/null
+++ b/docs/python_docs/python/api/gluon/data/vision/datasets/index.rst
@@ -0,0 +1,26 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+   or more contributor license agreements.  See the NOTICE file
+   distributed with this work for additional information
+   regarding copyright ownership.  The ASF licenses this file
+   to you under the Apache License, Version 2.0 (the
+   "License"); you may not use this file except in compliance
+   with the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing,
+   software distributed under the License is distributed on an
+   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+   KIND, either express or implied.  See the License for the
+   specific language governing permissions and limitations
+   under the License.
+
+vision.datasets
+===============
+
+Gluon provides pre-defined vision dataset functions in the :py:mod:`mxnet.gluon.data.vision.datasets`
+module.
+
+.. automodule:: mxnet.gluon.data.vision.datasets
+    :members:
+    :autosummary:
diff --git a/docs/python_docs/python/api/gluon/data/vision/index.rst b/docs/python_docs/python/api/gluon/data/vision/index.rst
new file mode 100644
index 000000000000..2731b5f4245a
--- /dev/null
+++ b/docs/python_docs/python/api/gluon/data/vision/index.rst
@@ -0,0 +1,53 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+   or more contributor license agreements.  See the NOTICE file
+   distributed with this work for additional information
+   regarding copyright ownership.  The ASF licenses this file
+   to you under the Apache License, Version 2.0 (the
+   "License"); you may not use this file except in compliance
+   with the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing,
+   software distributed under the License is distributed on an
+   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+   KIND, either express or implied.  See the License for the
+   specific language governing permissions and limitations
+   under the License.
+
+data.vision
+============
+
+.. automodule:: mxnet.gluon.data.vision
+
+Datasets
+^^^^^^^^
+
+.. autosummary::
+   :nosignatures:
+
+   mxnet.gluon.data.vision.datasets
+
+
+Data transformations
+^^^^^^^^^^^^^^^^^^^^
+
+
+.. autosummary::
+   :nosignatures:
+
+   mxnet.gluon.data.vision.transforms
+
+
+API Reference
+-------------
+.. automodule:: mxnet.gluon.data.vision
+    :members:
+    :autosummary:
+
+.. toctree::
+   :hidden:
+   :maxdepth: 2
+   :glob:
+
+   */index
\ No newline at end of file
diff --git a/docs/python_docs/python/api/gluon/data/vision/transforms/index.rst b/docs/python_docs/python/api/gluon/data/vision/transforms/index.rst
new file mode 100644
index 000000000000..60d975d87aff
--- /dev/null
+++ b/docs/python_docs/python/api/gluon/data/vision/transforms/index.rst
@@ -0,0 +1,48 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+   or more contributor license agreements.  See the NOTICE file
+   distributed with this work for additional information
+   regarding copyright ownership.  The ASF licenses this file
+   to you under the Apache License, Version 2.0 (the
+   "License"); you may not use this file except in compliance
+   with the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing,
+   software distributed under the License is distributed on an
+   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+   KIND, either express or implied.  See the License for the
+   specific language governing permissions and limitations
+   under the License.
+
+vision.transforms
+=================
+
+Gluon provides pre-defined vision transformation and data augmentation functions in the :py:mod:`mxnet.gluon.data.vision.transforms`
+module.
+
+.. currentmodule:: mxnet.gluon.data.vision
+
+.. autosummary::
+   :nosignatures:
+
+   transforms.Compose
+   transforms.Cast
+   transforms.ToTensor
+   transforms.Normalize
+   transforms.RandomResizedCrop
+   transforms.CenterCrop
+   transforms.Resize
+   transforms.RandomFlipLeftRight
+   transforms.RandomFlipTopBottom
+   transforms.RandomBrightness
+   transforms.RandomContrast
+   transforms.RandomSaturation
+   transforms.RandomHue
+   transforms.RandomColorJitter
+   transforms.RandomLighting
+
+API Reference
+-------------
+.. automodule:: mxnet.gluon.data.vision.transforms
+    :members:
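A small sketch of how the transforms listed above compose in practice (illustrative values, not part of the patch; the ImageNet mean/std constants are the conventional ones):

```python
from mxnet import nd
from mxnet.gluon.data.vision import transforms

aug = transforms.Compose([
    transforms.RandomResizedCrop(224),
    transforms.RandomFlipLeftRight(),
    transforms.ToTensor(),   # HWC uint8 -> CHW float32 in [0, 1]
    transforms.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
])

img = nd.random.uniform(0, 255, shape=(256, 256, 3)).astype('uint8')
print(aug(img).shape)        # (3, 224, 224)
```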
diff --git a/docs/python_docs/python/api/mxnet/log/index.rst b/docs/python_docs/python/api/mxnet/log/index.rst
new file mode 100644
index 000000000000..fd4d8788c28a
--- /dev/null
+++ b/docs/python_docs/python/api/mxnet/log/index.rst
@@ -0,0 +1,23 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+   or more contributor license agreements.  See the NOTICE file
+   distributed with this work for additional information
+   regarding copyright ownership.  The ASF licenses this file
+   to you under the Apache License, Version 2.0 (the
+   "License"); you may not use this file except in compliance
+   with the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing,
+   software distributed under the License is distributed on an
+   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+   KIND, either express or implied.  See the License for the
+   specific language governing permissions and limitations
+   under the License.
+
+mxnet.log
+=========
+
+.. automodule:: mxnet.log
+    :members:
+    :autosummary:
\ No newline at end of file
diff --git a/docs/python_docs/python/api/mxnet/model/index.rst b/docs/python_docs/python/api/mxnet/model/index.rst
new file mode 100644
index 000000000000..69bcddce6bc1
--- /dev/null
+++ b/docs/python_docs/python/api/mxnet/model/index.rst
@@ -0,0 +1,23 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+   or more contributor license agreements.  See the NOTICE file
+   distributed with this work for additional information
+   regarding copyright ownership.  The ASF licenses this file
+   to you under the Apache License, Version 2.0 (the
+   "License"); you may not use this file except in compliance
+   with the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing,
+   software distributed under the License is distributed on an
+   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+   KIND, either express or implied.  See the License for the
+   specific language governing permissions and limitations
+   under the License.
+
+mxnet.model
+===========
+
+.. automodule:: mxnet.model
+    :members:
+    :autosummary:
diff --git a/docs/python_docs/python/tutorials/getting-started/crash-course/5-predict.md b/docs/python_docs/python/tutorials/getting-started/crash-course/5-predict.md
index 7a7738d8df1b..9afe95b58403 100644
--- a/docs/python_docs/python/tutorials/getting-started/crash-course/5-predict.md
+++ b/docs/python_docs/python/tutorials/getting-started/crash-course/5-predict.md
@@ -21,7 +21,7 @@ A saved model can be used in multiple places, such as to continue training, to f

 ## Prerequisites

-Please run the [previous tutorial](train.md) to train the network and save its parameters to file. You will need this file to run the following steps.
+Please run the [previous tutorial](4-train.html) to train the network and save its parameters to file. You will need this file to run the following steps.

 ```{.python .input n=1}
 from mxnet import nd
diff --git a/docs/python_docs/python/tutorials/getting-started/crash-course/6-use_gpus.md b/docs/python_docs/python/tutorials/getting-started/crash-course/6-use_gpus.md
index b78c38ab7077..a0788ba7df2d 100644
--- a/docs/python_docs/python/tutorials/getting-started/crash-course/6-use_gpus.md
+++ b/docs/python_docs/python/tutorials/getting-started/crash-course/6-use_gpus.md
@@ -99,7 +99,7 @@ net(x)

 Finally, we show how to use multiple GPUs to jointly train a neural network through data parallelism. Let's assume there are *n* GPUs. We split each data batch into *n* parts, and then each GPU will run the forward and backward passes using one part of the data.

-Let's first copy the data definitions and the transform function from the [previous tutorial](predict.md).
+Let's first copy the data definitions and the transform function from the [previous tutorial](5-predict.html).

 ```{.python .input}
 batch_size = 256
diff --git a/docs/python_docs/python/tutorials/getting-started/gluon_from_experiment_to_deployment.md b/docs/python_docs/python/tutorials/getting-started/gluon_from_experiment_to_deployment.md
index 8d2c4e100c76..b1f65e682263 100644
--- a/docs/python_docs/python/tutorials/getting-started/gluon_from_experiment_to_deployment.md
+++ b/docs/python_docs/python/tutorials/getting-started/gluon_from_experiment_to_deployment.md
@@ -20,7 +20,7 @@
 ## Overview
 MXNet Gluon API comes with a lot of great features, and it can provide you with everything you need: from experimentation to deploying the model. In this tutorial, we will walk you through a common use case on how to build a model using gluon, train it on your data, and deploy it for inference.
-This tutorial covers training and inference in Python, please continue to [C++ inference part](https://mxnet.apache.org/versions/master/tutorials/c++/mxnet_cpp_inference_tutorial.html) after you finish.
+This tutorial covers training and inference in Python; please continue to the [C++ inference part](/api/cpp/docs/tutorials/cpp_inference) after you finish.

 Let's say you need to build a service that provides flower species recognition. A common problem is that you don't have enough data to train a good model. In such cases, a technique called Transfer Learning can be used to make a more robust model. In Transfer Learning we make use of a pre-trained model that solves a related task, and was trained on a very large standard dataset, such as ImageNet. ImageNet is from a different domain, but we can utilize the knowledge in this pre-trained model to perform the new task at hand.
@@ -77,7 +77,7 @@ from mxnet.gluon.data.vision import transforms
 from mxnet.gluon.model_zoo.vision import resnet50_v2
 ```

-Next, we define the hyper-parameters that we will use for fine-tuning. We will use the [MXNet learning rate scheduler](../packages/gluon/training/learning_rates/learning_rate_schedules.html) to adjust learning rates during training.
+Next, we define the hyper-parameters that we will use for fine-tuning. We will use the [MXNet learning rate scheduler](/api/python/docs/tutorials/packages/gluon/training/learning_rates/learning_rate_schedules.html) to adjust learning rates during training.
 Here we set the `epochs` to 1 for quick demonstration; please change it to 40 for actual training.

 ```python
@@ -161,7 +161,7 @@ test_data = gluon.data.DataLoader(
 We will use pre-trained ResNet50_v2 model which was pre-trained on the [ImageNet Dataset](http://www.image-net.org/) with 1000 classes. To match the classes in the Flower dataset, we must redefine the last softmax (output) layer to be 102, then initialize the parameters.

-Before we go to training, one unique Gluon feature you should be aware of is hybridization. It allows you to convert your imperative code to a static symbolic graph, which is much more efficient to execute. There are two main benefits of hybridizing your model: better performance and easier serialization for deployment. The best part is that it's as simple as just calling `net.hybridize()`. To know more about Gluon hybridization, please follow the [hybridization tutorial](https://mxnet.apache.org/tutorials/gluon/hybrid.html).
+Before we go to training, one unique Gluon feature you should be aware of is hybridization. It allows you to convert your imperative code to a static symbolic graph, which is much more efficient to execute. There are two main benefits of hybridizing your model: better performance and easier serialization for deployment. The best part is that it's as simple as just calling `net.hybridize()`. To know more about Gluon hybridization, please follow the [hybridization tutorial](/api/python/docs/tutorials/packages/gluon/blocks/hybridize.html).
@@ -265,7 +265,7 @@ finetune_net.export("flower-recognition", epoch=epochs)
 ## Load the model and run inference using the MXNet Module API

 MXNet provides various useful tools and interfaces for deploying your model for inference. For example, you can use [MXNet Model Server](https://github.com/awslabs/mxnet-model-server) to start a service and host your trained model easily.
-Besides that, you can also use MXNet's different language APIs to integrate your model with your existing service. We provide [Python](https://mxnet.apache.org/api/python/module/module.html), [Java](https://mxnet.apache.org/api/java/index.html), [Scala](https://mxnet.apache.org/api/scala/index.html), and [C++](https://mxnet.apache.org/api/c++/index.html) APIs.
+Besides that, you can also use MXNet's different language APIs to integrate your model with your existing service. We provide [Python](/api/python.html), [Java](/api/java.html), [Scala](/api/scala.html), and [C++](/api/cpp) APIs.

 Here we will briefly introduce how to run inference using Module API in Python. There is more detailed explanation available in the [Predict Image Tutorial](https://mxnet.apache.org/tutorials/python/predict_image.html). In general, prediction consists of the following steps:
@@ -315,7 +315,7 @@ You can continue to the [next tutorial](https://mxnet.apache.org/versions/master
 You can also find more ways to run inference and deploy your models here:
 1. [Java Inference examples](https://github.com/apache/incubator-mxnet/tree/master/scala-package/examples/src/main/java/org/apache/mxnetexamples/javaapi/infer)
-2. [Scala Inference examples](https://mxnet.apache.org/tutorials/scala/)
+2. [Scala Inference examples](/api/scala/docs/tutorials/infer)
 4. [MXNet Model Server Examples](https://github.com/awslabs/mxnet-model-server/tree/master/examples)

 ## References
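For readers following the hybridization paragraph in the hunk above, a minimal sketch of the hybridize-then-export flow (layer sizes are illustrative; `export` mirrors the tutorial's `finetune_net.export` call, and the net must run once after `hybridize()` before exporting):

```python
from mxnet import nd
from mxnet.gluon import nn

net = nn.HybridSequential()
net.add(nn.Dense(256, activation='relu'), nn.Dense(102))
net.initialize()
net.hybridize()                      # subsequent calls run through a cached static graph
out = net(nd.random.uniform(shape=(1, 512)))
net.export("flower-recognition")     # writes -symbol.json and -0000.params for deployment
```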
diff --git a/docs/python_docs/python/tutorials/getting-started/to-mxnet/pytorch.md b/docs/python_docs/python/tutorials/getting-started/to-mxnet/pytorch.md
index d7720bac4348..1ab490fbaa42 100644
--- a/docs/python_docs/python/tutorials/getting-started/to-mxnet/pytorch.md
+++ b/docs/python_docs/python/tutorials/getting-started/to-mxnet/pytorch.md
@@ -164,7 +164,7 @@ mx_trainer = gluon.Trainer(mx_net.collect_params(),
                            'sgd', {'learning_rate': 0.1})
 ```

-The code difference between frameworks is small. The main difference is that in Apache MXNet we use [Trainer](https://mxnet.apache.org/api/python/docs/api/gluon/mxnet.gluon.Trainer.html) class, which accepts optimization algorithm as an argument. We also use [.collect_params()](/api/python/docs/api/gluon/_autogen/mxnet.gluon.nn.Block.collect_params.html) method to get parameters of the network.
+The code difference between frameworks is small. The main difference is that in Apache MXNet we use the [Trainer](/api/python/docs/api/gluon/trainer.html) class, which accepts an optimization algorithm as an argument. We also use the [.collect_params()](/api/python/docs/api/gluon/block.html#mxnet.gluon.Block.collect_params) method to get parameters of the network.

 ### 4. Training
@@ -212,13 +212,13 @@ Some of the differences in Apache MXNet when compared to PyTorch are as follows:

 * In Apache MXNet, you don't need to flatten the 4-D input into 2-D when feeding the data into forward pass.

-* In Apache MXNet, you need to perform the calculation within the [autograd.record()](/api/python/docs/api/gluon-related/_autogen/mxnet.autograd.record.html) scope so that it can be automatically differentiated in the backward pass.
+* In Apache MXNet, you need to perform the calculation within the [autograd.record()](/api/python/docs/api/autograd/index.html?autograd%20record#mxnet.autograd.record) scope so that it can be automatically differentiated in the backward pass.

 * It is not necessary to clear the gradient every time as with PyTorch's `trainer.zero_grad()` because by default the new gradient is written in, not accumulated.

-* You need to specify the update step size (usually batch size) when performing [step()](/api/python/docs/api/gluon/_autogen/mxnet.gluon.Trainer.step.html) on the trainer.
+* You need to specify the update step size (usually batch size) when performing [step()](/api/python/docs/api/gluon/trainer.html?#mxnet.gluon.Trainer.step) on the trainer.

-* You need to call [.asscalar()](/api/python/docs/api/ndarray/_autogen/mxnet.ndarray.NDArray.asscalar.html) to turn a multidimensional array into a scalar.
+* You need to call [.asscalar()](/api/python/docs/api/ndarray/ndarray.html?#mxnet.ndarray.NDArray.asscalar) to turn a multidimensional array into a scalar.

 * In this sample, Apache MXNet is twice as fast as PyTorch. Though you need to be cautious with such toy comparisons.

@@ -230,9 +230,9 @@ As we saw above, Apache MXNet Gluon API and PyTorch have many similarities. The

 While Apache MXNet Gluon API is very similar to PyTorch, there is some extra functionality that can make your code even faster.

-* Check out [Hybridize tutorial](/api/python/docs/guide/packages/gluon/hybridize.html) to learn how to write imperative code which can be converted to symbolic one.
+* Check out the [Hybridize tutorial](/api/python/docs/tutorials/packages/gluon/blocks/hybridize.html) to learn how to write imperative code which can be converted to a symbolic one.

-* Also, check out how to extend Apache MXNet with your own [custom layers](/api/python/docs/guide/extend/custom_layer.html).
+* Also, check out how to extend Apache MXNet with your own [custom layers](/api/python/docs/tutorials/packages/gluon/blocks/custom-layer.html?custom_layers).

 ## Appendix
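The bullet list in the pytorch.md hunk above is easier to follow next to code. A compact sketch (toy network and batch size, not from the tutorial) touching each point — `autograd.record()`, the implicit gradient overwrite, `step(batch_size)`, and `asscalar()`:

```python
import mxnet as mx
from mxnet import autograd, gluon
from mxnet.gluon import nn

net = nn.Dense(1)
net.initialize()
trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.1})

x = mx.nd.random.uniform(shape=(4, 2))
with autograd.record():          # computations to differentiate go in this scope
    loss = (net(x) ** 2).mean()
loss.backward()                  # no zero_grad(): gradients are overwritten by default
trainer.step(4)                  # update step size = batch size
print(loss.asscalar())           # NDArray -> Python scalar
```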
diff --git a/docs/python_docs/python/tutorials/packages/gluon/image/mnist.md b/docs/python_docs/python/tutorials/packages/gluon/image/mnist.md
index 8a3d8229413b..39726a3a511c 100644
--- a/docs/python_docs/python/tutorials/packages/gluon/image/mnist.md
+++ b/docs/python_docs/python/tutorials/packages/gluon/image/mnist.md
@@ -112,8 +112,8 @@ to train the MLP network we defined above.

 For our training, we will make use of the stochastic gradient descent (SGD) optimizer. In particular, we'll be using mini-batch SGD. Standard SGD processes train data one example at a time. In practice, this is very slow and one can speed up the process by processing examples in small batches. In this case, our batch size will be 100, which is a reasonable choice. Another parameter we select here is the learning rate, which controls the step size the optimizer takes in search of a solution. We'll pick a learning rate of 0.02, again a reasonable choice. Settings such as batch size and learning rate are what are usually referred to as hyper-parameters. What values we give them can have a great impact on training performance.

-We will use [Trainer](https://mxnet.io/api/python/docs/api/gluon/mxnet.gluon.Trainer.html) class to apply the
-[SGD optimizer](https://mxnet.io/api/python/docs/api/gluon-related/_autogen/mxnet.optimizer.SGD.html) on the
+We will use the [Trainer](/api/python/docs/api/gluon/trainer.html) class to apply the
+[SGD optimizer](/api/python/docs/api/optimizer/index.html#mxnet.optimizer.SGD) on the
 initialized parameters.

 ```python
diff --git a/docs/python_docs/python/tutorials/packages/ndarray/sparse/csr.md b/docs/python_docs/python/tutorials/packages/ndarray/sparse/csr.md
index 0b362513c0ae..b91279cff4d4 100644
--- a/docs/python_docs/python/tutorials/packages/ndarray/sparse/csr.md
+++ b/docs/python_docs/python/tutorials/packages/ndarray/sparse/csr.md
@@ -556,7 +556,7 @@ except mx.MXNetError as err:

 ## Next

-[Train a Linear Regression Model with Sparse Symbols](http://mxnet.apache.org/tutorials/sparse/train.html)
+[Train a Linear Regression Model with Sparse Symbols](/api/python/docs/tutorials/packages/ndarray/sparse/train.html)
diff --git a/docs/python_docs/python/tutorials/packages/ndarray/sparse/row_sparse.md b/docs/python_docs/python/tutorials/packages/ndarray/sparse/row_sparse.md
index 1241182af85b..7500e82cf9e6 100644
--- a/docs/python_docs/python/tutorials/packages/ndarray/sparse/row_sparse.md
+++ b/docs/python_docs/python/tutorials/packages/ndarray/sparse/row_sparse.md
@@ -578,7 +578,7 @@ except mx.MXNetError as err:

 ## Next

-[Train a Linear Regression Model with Sparse Symbols](http://mxnet.apache.org/tutorials/sparse/train.html)
+[Train a Linear Regression Model with Sparse Symbols](/api/python/docs/tutorials/packages/ndarray/sparse/train.html)
diff --git a/docs/python_docs/python/tutorials/packages/ndarray/sparse/train.md b/docs/python_docs/python/tutorials/packages/ndarray/sparse/train.md
index 71669e142a4b..336185cf7583 100644
--- a/docs/python_docs/python/tutorials/packages/ndarray/sparse/train.md
+++ b/docs/python_docs/python/tutorials/packages/ndarray/sparse/train.md
@@ -27,18 +27,18 @@ then train a linear regression model using sparse symbols with the Module API.

 To complete this tutorial, we need:

-- MXNet. See the instructions for your operating system in [Setup and Installation](https://mxnet.io/get_started).
+- MXNet. See the instructions for your operating system in [Setup and Installation](/get_started).

-- [Jupyter Notebook](https://jupyter.org/index.html) and [Python Requests](http://docs.python-requests.org/en/master/) packages.
+- [Jupyter Notebook](https://jupyter.org/index.html) and [Python Requests](https://3.python-requests.org/) packages.
 ```
 pip install jupyter requests
 ```

 - Basic knowledge of Symbol in MXNet. See the detailed tutorial for Symbol in [Symbol - Neural Network Graphs and Auto-differentiation](https://mxnet.apache.org/tutorials/basic/symbol.html).

-- Basic knowledge of CSRNDArray in MXNet. See the detailed tutorial for CSRNDArray in [CSRNDArray - NDArray in Compressed Sparse Row Storage Format](https://mxnet.apache.org/versions/master/tutorials/sparse/csr.html).
+- Basic knowledge of CSRNDArray in MXNet. See the detailed tutorial for CSRNDArray in [CSRNDArray - NDArray in Compressed Sparse Row Storage Format](/api/python/docs/tutorials/packages/ndarray/sparse/csr.html).

-- Basic knowledge of RowSparseNDArray in MXNet. See the detailed tutorial for RowSparseNDArray in [RowSparseNDArray - NDArray for Sparse Gradient Updates](https://mxnet.apache.org/versions/master/tutorials/sparse/row_sparse.html).
+- Basic knowledge of RowSparseNDArray in MXNet. See the detailed tutorial for RowSparseNDArray in [RowSparseNDArray - NDArray for Sparse Gradient Updates](/api/python/docs/tutorials/packages/ndarray/sparse/row_sparse.html).

 ## Variables
@@ -155,7 +155,7 @@ f = mx.sym.sparse.elemwise_add(c, c)
 ### Storage Type Inference

 What will be the output storage types of sparse symbols? In MXNet, for any sparse symbol, the result storage types are inferred based on storage types of inputs.
-You can read the [Sparse Symbol API](https://mxnet.apache.org/versions/master/api/python/symbol/sparse.html) documentation to find what output storage types are. In the example below we will try out the storage types introduced in the Row Sparse and Compressed Sparse Row tutorials: `default` (dense), `csr`, and `row_sparse`.
+You can read the [Sparse Symbol API](/api/python/docs/api/symbol/sparse/index.html) documentation to find what output storage types are. In the example below we will try out the storage types introduced in the Row Sparse and Compressed Sparse Row tutorials: `default` (dense), `csr`, and `row_sparse`.

 ```python
diff --git a/docs/static_site/src/_includes/get_started/devices/nvidia-jetson.md b/docs/static_site/src/_includes/get_started/devices/nvidia-jetson.md
index fe515f3392d7..40fb1d2e82f5 100644
--- a/docs/static_site/src/_includes/get_started/devices/nvidia-jetson.md
+++ b/docs/static_site/src/_includes/get_started/devices/nvidia-jetson.md
@@ -1,4 +1,4 @@
 # NVIDIA Jetson Devices

 To install MXNet on a Jetson TX or Nano, please refer to the [Jetson installation
-guide](get_started/jetson_setup).
\ No newline at end of file
+guide](/get_started/jetson_setup).
\ No newline at end of file
diff --git a/docs/static_site/src/_includes/get_started/get_started.html b/docs/static_site/src/_includes/get_started/get_started.html
index 4905d28ce2d3..77367c7ed337 100644
--- a/docs/static_site/src/_includes/get_started/get_started.html
+++ b/docs/static_site/src/_includes/get_started/get_started.html
@@ -256,8 +256,8 @@
 Installing MXNet

-For more installation options, refer to the Ubuntu installation guide and
-CentOS installation guide.
+For more installation options, refer to the Ubuntu installation guide and
+CentOS installation guide.
@@ -354,7 +354,7 @@
 Installing MXNet

-For more installation options, refer to the MXNet macOS installation guide.
+For more installation options, refer to the MXNet macOS installation guide.
@@ -440,7 +440,7 @@
 Installing MXNet

-For more installation options, refer to the MXNet Windows installation guide.
+For more installation options, refer to the MXNet Windows installation guide.
diff --git a/docs/static_site/src/_includes/get_started/linux/java/cpu.md b/docs/static_site/src/_includes/get_started/linux/java/cpu.md
index 5345a2d754b2..fc6f598fa5ee 100644
--- a/docs/static_site/src/_includes/get_started/linux/java/cpu.md
+++ b/docs/static_site/src/_includes/get_started/linux/java/cpu.md
@@ -1,6 +1,6 @@
 You can use the Maven packages defined in the following dependency to include MXNet in
 your Java project. The Java API is provided as a subset of the Scala API and is intended for inference only.
-Please refer to the MXNet-Java setup guide for a detailed set of
+Please refer to the MXNet-Java setup guide for a detailed set of
 instructions to help you with the setup process.
diff --git a/docs/static_site/src/_includes/get_started/linux/java/gpu.md b/docs/static_site/src/_includes/get_started/linux/java/gpu.md
index 5e687a353fe4..6f6757f6e2ea 100644
--- a/docs/static_site/src/_includes/get_started/linux/java/gpu.md
+++ b/docs/static_site/src/_includes/get_started/linux/java/gpu.md
@@ -1,6 +1,6 @@
 You can use the Maven packages defined in the following dependency to include MXNet in
 your Java project. The Java API is provided as a subset of the Scala API and is intended for inference only.
-Please refer to the MXNet-Java setup guide for a detailed set of
+Please refer to the MXNet-Java setup guide for a detailed set of
 instructions to help you with the setup process.
diff --git a/docs/static_site/src/_includes/get_started/linux/julia/build-from-source.md b/docs/static_site/src/_includes/get_started/linux/julia/build-from-source.md
index fbbc0bd248a9..018aca9d7387 100644
--- a/docs/static_site/src/_includes/get_started/linux/julia/build-from-source.md
+++ b/docs/static_site/src/_includes/get_started/linux/julia/build-from-source.md
@@ -1,2 +1,2 @@
-Refer to the [Julia section of the MXNet Ubuntu installation guide](get_started/ubuntu_setup#install-the-mxnet-package-for-julia).
+Refer to the [Julia section of the MXNet Ubuntu installation guide](/get_started/ubuntu_setup#install-the-mxnet-package-for-julia).
diff --git a/docs/static_site/src/_includes/get_started/linux/r/cpu.md b/docs/static_site/src/_includes/get_started/linux/r/cpu.md
index c0a4e015b61d..88ca5dd39933 100644
--- a/docs/static_site/src/_includes/get_started/linux/r/cpu.md
+++ b/docs/static_site/src/_includes/get_started/linux/r/cpu.md
@@ -1,5 +1,5 @@
 The default version of R that is installed with `apt-get` is insufficient. You will need
-to first [install R v3.4.4+ and build MXNet from source](get_started/ubuntu_setup.html#install-the-mxnet-package-for-r).
+to first [install R v3.4.4+ and build MXNet from source](/get_started/ubuntu_setup.html#install-the-mxnet-package-for-r).

 After you have setup R v3.4.4+ and MXNet, you can build and install the MXNet R bindings
 with the following, assuming that `incubator-mxnet` is the source directory you used to build MXNet as follows:
diff --git a/docs/static_site/src/_includes/get_started/linux/r/gpu.md b/docs/static_site/src/_includes/get_started/linux/r/gpu.md
index 57afe7a8d65e..16fbfd09d4d4 100644
--- a/docs/static_site/src/_includes/get_started/linux/r/gpu.md
+++ b/docs/static_site/src/_includes/get_started/linux/r/gpu.md
@@ -1,7 +1,7 @@
 The default version of R that is installed with `apt-get` is insufficient. You will need
 to first [install R v3.4.4+ and build MXNet from
-source](get_started/ubuntu_setup.html#install-the-mxnet-package-for-r).
+source](/get_started/ubuntu_setup.html#install-the-mxnet-package-for-r).

 After you have setup R v3.4.4+ and MXNet, you can build and install the MXNet R bindings
 with the
diff --git a/docs/static_site/src/_includes/get_started/macos/java/cpu.md b/docs/static_site/src/_includes/get_started/macos/java/cpu.md
index 2050149fd33d..002037a15771 100644
--- a/docs/static_site/src/_includes/get_started/macos/java/cpu.md
+++ b/docs/static_site/src/_includes/get_started/macos/java/cpu.md
@@ -1,7 +1,7 @@
 You can use the Maven packages defined in the following dependency to include MXNet in
 your Java project. The Java API is provided as a subset of the Scala API and is intended for inference only.
-Please refer to the [MXNet-Java setup guide](get_started/java_setup.html) for a detailed set of instructions to help you with the setup process.
+Please refer to the [MXNet-Java setup guide](/get_started/java_setup.html) for a detailed set of instructions to help you with the setup process.

 PyPI for
-other MXNet pip packages, or validate your MXNet installation.
+other MXNet pip packages, or validate your MXNet installation.

 Download from source

-The signed source code for Apache MXNet (incubating) is available for download here
+The signed source code for Apache MXNet (incubating) is available for download here
diff --git a/include/mxnet/c_api.h b/include/mxnet/c_api.h
index 177ec5d40146..ac0c6726f2c7 100644
--- a/include/mxnet/c_api.h
+++ b/include/mxnet/c_api.h
@@ -1170,7 +1170,7 @@ MXNET_DLL int MXAutogradIsTraining(bool* curr);
  * \param curr returns the current status
  * \return 0 when success, -1 when failure happens
  */
-MXNET_DLL int MXIsNumpyShape(bool* curr);
+MXNET_DLL int MXIsNumpyShape(int* curr);
 /*!
  * \brief set numpy compatibility switch
  * \param is_np_shape 1 when numpy shape semantics is thread local on,
diff --git a/include/mxnet/imperative.h b/include/mxnet/imperative.h
index 18f6424e54f7..dbd81e575872 100644
--- a/include/mxnet/imperative.h
+++ b/include/mxnet/imperative.h
@@ -108,12 +108,14 @@ class Imperative {
       is_recording_ = is_recording;
       return old;
     }
-    /*! \brief whether numpy compatibility is on. */
-    bool is_np_shape() const {
+    /*! \brief return current numpy compatibility status,
+     *  GlobalOn(2), ThreadLocalOn(1), Off(0).
+     * */
+    int is_np_shape() const {
       if (is_np_shape_global_) {
-        return true;
+        return 2;
       }
-      return is_np_shape_thread_local_;
+      return is_np_shape_thread_local_ ? 1 : 0;
    }
    /*! \brief specify numpy compatibility off, thread local on or global on. */
    bool set_is_np_shape(int is_np_shape) {
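The two headers above turn `MXIsNumpyShape` from a boolean into a tri-state integer. A sketch of reading that state through the Python ctypes binding — `mxnet.base._LIB`, `check_call`, and `mxnet.util.set_np_shape` all appear elsewhere in this patch, and the returned values follow the new `imperative.h` comment:

```python
import ctypes
from mxnet.base import _LIB, check_call
from mxnet.util import set_np_shape

set_np_shape(True)                            # thread-local NumPy-shape semantics
curr = ctypes.c_int()                         # was c_bool before this change
check_call(_LIB.MXIsNumpyShape(ctypes.byref(curr)))
print(curr.value)                             # 1; 0 = off, 2 = globally on
```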
+ "ndarray/statistic.jl" + "ndarray/trig.jl" + "ndarray/type.jl" +""" +listpages(x) = + ["$x.jl"; joinpath.(x, readdir(joinpath(@__DIR__, "..", "src", x)))] + +const api_pages = [ + "api/context.md", + "api/ndarray.md", + "api/symbolic-node.md", + "api/model.md", + "api/initializers.md", + "api/optimizers.md", + "api/callbacks.md", + "api/metric.md", + "api/io.md", + "api/nn-factory.md", + "api/executor.md", + "api/kvstore.md", + "api/visualize.md", +] + makedocs( sitename = "MXNet.jl", modules = MXNet, diff --git a/julia/docs/mkdocs.yml b/julia/docs/mkdocs.yml index 22cb71869673..383505621540 100644 --- a/julia/docs/mkdocs.yml +++ b/julia/docs/mkdocs.yml @@ -62,4 +62,5 @@ nav: - Symbolic API: api/symbolic-node.md - Neural Networks Factory: api/nn-factory.md - Executor: api/executor.md + - Key-Value Store: api/kvstore.md - Network Visualization: api/visualize.md diff --git a/julia/docs/src/api.md b/julia/docs/src/api.md index 60cb0831d1bf..04cfadd6d698 100644 --- a/julia/docs/src/api.md +++ b/julia/docs/src/api.md @@ -18,18 +18,5 @@ # API Documentation ```@contents -Pages = [ - "api/symbolic-node.md", - "api/ndarray.md", - "api/context.md", - "api/model.md", - "api/initializers.md", - "api/optimizers.md", - "api/callbacks.md", - "api/metric.md", - "api/io.md", - "api/nn-factory.md", - "api/executor.md", - "api/visualize.md", -] +Pages = api_pages ``` diff --git a/julia/docs/src/api/ndarray.md b/julia/docs/src/api/ndarray.md index 64f59dc5393e..640e8b3ec372 100644 --- a/julia/docs/src/api/ndarray.md +++ b/julia/docs/src/api/ndarray.md @@ -19,7 +19,7 @@ ## Arithmetic Operations -In the following example `y` can be a `Real` value or another `NDArray` +In the following example `y` can be a `Real` value or another `NDArray`. | API | Example | | |-----|----------|----------------------------| @@ -70,21 +70,5 @@ In the following example `y` can be a `Real` value or another `NDArray` ```@autodocs Modules = [MXNet.mx] -Pages = [ - "ndarray.jl", - "ndarray/activation.jl", - "ndarray/arithmetic.jl", - "ndarray/array.jl", - "ndarray/autoimport.jl", - "ndarray/comparison.jl", - "ndarray/context.jl", - "ndarray/io.jl", - "ndarray/linalg.jl", - "ndarray/reduction.jl", - "ndarray/remap.jl", - "ndarray/show.jl", - "ndarray/statistic.jl", - "ndarray/trig.jl", - "ndarray/type.jl", -] +Pages = listpages("ndarray") ``` diff --git a/julia/docs/src/api/symbolic-node.md b/julia/docs/src/api/symbolic-node.md index 0efe4605c414..785dda87fbde 100644 --- a/julia/docs/src/api/symbolic-node.md +++ b/julia/docs/src/api/symbolic-node.md @@ -19,14 +19,5 @@ ```@autodocs Modules = [MXNet.mx] -Pages = [ - "symbolic-node.jl", - "symbolic-node/arithmetic.jl", - "symbolic-node/array.jl", - "symbolic-node/autodiff.jl", - "symbolic-node/io.jl", - "symbolic-node/op.jl", - "symbolic-node/show.jl", - "symbolic-node/type.jl", -] +Pages = listpages("symbolic-node") ``` diff --git a/julia/docs/src/index.md b/julia/docs/src/index.md index aacd844cc38e..4213265b4bd4 100644 --- a/julia/docs/src/index.md +++ b/julia/docs/src/index.md @@ -55,18 +55,6 @@ Depth = 2 ## API Documentation ```@contents -Pages = [ - "api/context.md", - "api/ndarray.md", - "api/symbolic-node.md", - "api/model.md", - "api/initializers.md", - "api/optimizers.md", - "api/callbacks.md", - "api/metric.md", - "api/io.md", - "api/nn-factory.md", - "api/executor.md", - "api/visualize.md", -] +Pages = api_pages +Depth = 2 ``` diff --git a/julia/docs/src/tutorial/char-lstm.md b/julia/docs/src/tutorial/char-lstm.md index bc7f7b471d94..ab7e9352b5ab 100644 --- 
a/julia/docs/src/tutorial/char-lstm.md +++ b/julia/docs/src/tutorial/char-lstm.md @@ -31,7 +31,7 @@ networks yet, the example shown here is an implementation of LSTM by using the default FeedForward model via explicitly unfolding over time. We will be using fixed-length input sequence for training. The code is adapted from the [char-rnn example for MXNet's Python -binding](https://github.com/dmlc/mxnet/blob/master/example/rnn/char_lstm.ipynb), +binding](https://github.com/dmlc/mxnet-notebooks/blob/master/python/tutorials/char_lstm.ipynb), which demonstrates how to use low-level [Symbolic API](@ref) to build customized neural network models directly. @@ -165,7 +165,7 @@ char-lstm. To train the model, we just follow the standard high-level API. Firstly, we construct a LSTM symbolic architecture: Note all the parameters are defined in -[examples/char-lstm/config.jl](https://github.com/dmlc/MXNet.jl/blob/master/examples/char-lstm/config.jl). +[examples/char-lstm/config.jl](https://github.com/apache/incubator-mxnet/blob/master/julia/examples/char-lstm/config.jl). Now we load the text file and define the data provider. The data `input.txt` we used in this example is [a tiny Shakespeare dataset](https://github.com/dmlc/web-data/tree/master/mxnet/tinyshakespeare). @@ -318,6 +318,6 @@ illustrations](http://colah.github.io/posts/2015-08-Understanding-LSTMs/), but could otherwise be very useful for debugging. As we can see, the LSTM unfolded over time is just a (very) deep neural network. The complete code for producing this visualization can be found in -[examples/char-lstm/visualize.jl](https://github.com/apache/incubator-mxnet/tree/master/julia/examples/char-lstmvisualize.jl). +[examples/char-lstm/visualize.jl](https://github.com/apache/incubator-mxnet/blob/master/julia/examples/char-lstm/visualize.jl). ![image](images/char-lstm-vis.svg) diff --git a/julia/docs/src/tutorial/mnist.md b/julia/docs/src/tutorial/mnist.md index cc5267071f11..a404f75efe12 100644 --- a/julia/docs/src/tutorial/mnist.md +++ b/julia/docs/src/tutorial/mnist.md @@ -23,7 +23,7 @@ multi-layer perceptron and then a convolutional neural network (the LeNet architecture) on the [MNIST handwritten digit dataset](http://yann.lecun.com/exdb/mnist/). The code for this tutorial could be found in -[examples/mnist](https://github.com/dmlc/MXNet.jl/tree/master/examples/mnist). There are also two Jupyter notebooks that expand a little more on the [MLP](https://github.com/ultradian/julia_notebooks/blob/master/mxnet/mnistMLP.ipynb) and the [LeNet](https://github.com/ultradian/julia_notebooks/blob/master/mxnet/mnistLenet.ipynb), using the more general `ArrayDataProvider`. +[examples/mnist](/api/julia/docs/api/tutorial/mnist/). There are also two Jupyter notebooks that expand a little more on the [MLP](https://github.com/ultradian/julia_notebooks/blob/master/mxnet/mnistMLP.ipynb) and the [LeNet](https://github.com/ultradian/julia_notebooks/blob/master/mxnet/mnistLenet.ipynb), using the more general `ArrayDataProvider`. Simple 3-layer MLP ------------------ diff --git a/julia/docs/src/user-guide/overview.md b/julia/docs/src/user-guide/overview.md index 974cc7dee974..342448a15bed 100644 --- a/julia/docs/src/user-guide/overview.md +++ b/julia/docs/src/user-guide/overview.md @@ -269,8 +269,6 @@ symbolic composition system. It is like [Theano](http://deeplearning.net/software/theano/), except that we avoided long expression compilation time by providing *larger* neural network related building blocks to guarantee computation performance. 
-See also [this note](https://mxnet.readthedocs.org/en/latest/program_model.html) -for the design and trade-off of the MXNet symbolic composition system. The basic type is `mx.SymbolicNode`. The following is a trivial example of composing two symbols with the `+` operation. diff --git a/julia/examples/char-lstm/README.md b/julia/examples/char-lstm/README.md index ac745dd4cc41..155f29603623 100644 --- a/julia/examples/char-lstm/README.md +++ b/julia/examples/char-lstm/README.md @@ -29,7 +29,7 @@ and `StatsBase.jl`. ## Training This example is adapted from the -[example in Python binding](https://github.com/dmlc/mxnet/blob/master/example/rnn/char_lstm.ipynb) of +[example in Python binding](https://github.com/dmlc/mxnet-notebooks/blob/master/python/tutorials/char_lstm.ipynb) of MXNet. The data `input.txt` can be downloaded [here](https://github.com/dmlc/web-data/tree/master/mxnet/tinyshakespeare). Modify parameters in [config.jl](config.jl) and then run [train.jl](train.jl). An example output diff --git a/julia/src/executor.jl b/julia/src/executor.jl index 37f2dde615b8..7f6c2bb5aa58 100644 --- a/julia/src/executor.jl +++ b/julia/src/executor.jl @@ -245,7 +245,7 @@ Total 11 TempSpace resource requested ``` """ Base.print(io::IO, x::Executor) = print(io, debug_str(x)) -Base.print(x::Executor) = print(STDOUT, x) +Base.print(x::Executor) = print(stdout, x) function debug_str(x::Executor) s_ref = Ref{Cstring}(C_NULL) diff --git a/julia/src/symbolic-node/show.jl b/julia/src/symbolic-node/show.jl index f07c6b4655ee..9d40ea124505 100644 --- a/julia/src/symbolic-node/show.jl +++ b/julia/src/symbolic-node/show.jl @@ -57,6 +57,6 @@ function Base.print(io::IO, sym::SymbolicNode) print(io, unsafe_string(out[])) end -Base.print(sym::SymbolicNode) = print(STDOUT, sym) +Base.print(sym::SymbolicNode) = print(stdout, sym) diff --git a/perl-package/AI-MXNet/t/test_autograd.t b/perl-package/AI-MXNet/t/test_autograd.t index 931c6d59333b..2ddad60df989 100644 --- a/perl-package/AI-MXNet/t/test_autograd.t +++ b/perl-package/AI-MXNet/t/test_autograd.t @@ -23,6 +23,7 @@ use AI::MXNet::TestUtils qw(same almost_equal rand_ndarray); use AI::MXNet::Base qw(:DEFAULT pones); use Test::More tests => 246; $ENV{MXNET_STORAGE_FALLBACK_LOG_VERBOSE} = 0; +$ENV{MXNET_SUBGRAPH_VERBOSE} = 0; sub autograd_assert { diff --git a/perl-package/AI-MXNet/t/test_gluon_trainer.t b/perl-package/AI-MXNet/t/test_gluon_trainer.t index 81113af28c20..3b1130af4ecf 100644 --- a/perl-package/AI-MXNet/t/test_gluon_trainer.t +++ b/perl-package/AI-MXNet/t/test_gluon_trainer.t @@ -25,6 +25,7 @@ use AI::MXNet::TestUtils qw(almost_equal dies_ok); use Scalar::Util qw(refaddr); use AI::MXNet::Base; $ENV{MXNET_STORAGE_FALLBACK_LOG_VERBOSE} = 0; +$ENV{MXNET_SUBGRAPH_VERBOSE} = 0; sub test_multi_trainer { @@ -252,4 +253,3 @@ sub test_trainer_reset_kv } test_trainer_reset_kv(); - diff --git a/perl-package/AI-MXNet/t/test_module.t b/perl-package/AI-MXNet/t/test_module.t index 3bbd8fdc4ea4..55e098683399 100644 --- a/perl-package/AI-MXNet/t/test_module.t +++ b/perl-package/AI-MXNet/t/test_module.t @@ -22,6 +22,7 @@ use AI::MXNet qw(mx); use AI::MXNet::Base; use AI::MXNet::TestUtils qw(almost_equal enumerate same_array dies_like rand_ndarray); $ENV{MXNET_STORAGE_FALLBACK_LOG_VERBOSE} = 0; +$ENV{MXNET_SUBGRAPH_VERBOSE} = 0; sub test_module_layout { diff --git a/perl-package/AI-MXNet/t/test_sparse_ndarray.t b/perl-package/AI-MXNet/t/test_sparse_ndarray.t index f143346b4890..afb0b25aa816 100644 --- a/perl-package/AI-MXNet/t/test_sparse_ndarray.t +++ 
b/perl-package/AI-MXNet/t/test_sparse_ndarray.t @@ -24,6 +24,7 @@ use AI::MXNet::TestUtils qw(zip assert enumerate same rand_shape_2d rand_shape_3 rand_sparse_ndarray random_arrays almost_equal rand_ndarray randint allclose dies_ok); use AI::MXNet::Base qw(pones pzeros pdl product rand_sparse); $ENV{MXNET_STORAGE_FALLBACK_LOG_VERBOSE} = 0; +$ENV{MXNET_SUBGRAPH_VERBOSE} = 0; sub sparse_nd_ones diff --git a/python/mxnet/kvstore.py b/python/mxnet/kvstore.py index 5d332ff45ecb..61c64ec0984f 100644 --- a/python/mxnet/kvstore.py +++ b/python/mxnet/kvstore.py @@ -31,8 +31,7 @@ from .profiler import set_kvstore_handle def _ctype_key_value(keys, vals): - """ - Returns ctype arrays for the key-value args, and the whether string keys are used. + """Returns ctype arrays for the key-value args, and the whether string keys are used. For internal use only. """ if isinstance(keys, (tuple, list)): @@ -66,9 +65,7 @@ def _ctype_key_value(keys, vals): return (c_keys, c_handle_array(vals), use_str_keys) def _ctype_dict(param_dict): - """ - Returns ctype arrays for keys and values(converted to strings) in a dictionary - """ + """Returns ctype arrays for keys and values(converted to strings) in a dictionary""" assert(isinstance(param_dict, dict)), \ "unexpected type for param_dict: " + str(type(param_dict)) c_keys = c_array(ctypes.c_char_p, [c_str(k) for k in param_dict.keys()]) diff --git a/python/mxnet/metric.py b/python/mxnet/metric.py index 07ec2ef4d61d..6e2d66cb9d15 100644 --- a/python/mxnet/metric.py +++ b/python/mxnet/metric.py @@ -153,8 +153,7 @@ def reset(self): self.global_sum_metric = 0.0 def reset_local(self): - """Resets the local portion of the internal evaluation results - to initial state.""" + """Resets the local portion of the internal evaluation results to initial state.""" self.num_inst = 0 self.sum_metric = 0.0 @@ -372,8 +371,7 @@ def reset(self): pass def reset_local(self): - """Resets the local portion of the internal evaluation results - to initial state.""" + """Resets the local portion of the internal evaluation results to initial state.""" try: for metric in self.metrics: metric.reset_local() @@ -592,8 +590,7 @@ def update(self, labels, preds): class _BinaryClassificationMetrics(object): - """ - Private container class for classification metric statistics. True/false positive and + """Private container class for classification metric statistics. True/false positive and true/false negative counts are sufficient statistics for various classification metrics. This class provides the machinery to track those statistics across mini-batches of (label, prediction) pairs. @@ -610,9 +607,7 @@ def __init__(self): self.global_true_negatives = 0 def update_binary_stats(self, label, pred): - """ - Update various binary classification counts for a single (label, pred) - pair. + """Update various binary classification counts for a single (label, pred) pair. Parameters ---------- @@ -691,9 +686,7 @@ def global_fscore(self): return 0. def matthewscc(self, use_global=False): - """ - Calculate the Matthew's Correlation Coefficent - """ + """Calculate the Matthew's Correlation Coefficent""" if use_global: if not self.global_total_examples: return 0. @@ -1604,8 +1597,7 @@ def reset(self): self.reset_local() def reset_local(self): - """Resets the local portion of the internal evaluation results - to initial state.""" + """Resets the local portion of the internal evaluation results to initial state.""" self.num_inst = 0. 
self.lcm = numpy.zeros((self.k, self.k)) diff --git a/python/mxnet/profiler.py b/python/mxnet/profiler.py index 7dbc060ed60f..8e8ac87c9e06 100644 --- a/python/mxnet/profiler.py +++ b/python/mxnet/profiler.py @@ -207,8 +207,7 @@ def pause(profile_process='worker'): def resume(profile_process='worker'): - """ - Resume paused profiling. + """Resume paused profiling. Parameters ---------- diff --git a/python/mxnet/rtc.py b/python/mxnet/rtc.py index 4dea0e656b7e..5dfc5ea6dfe2 100644 --- a/python/mxnet/rtc.py +++ b/python/mxnet/rtc.py @@ -172,7 +172,8 @@ def get_kernel(self, name, signature): class CudaKernel(object): """Constructs CUDA kernel. Should be created by `CudaModule.get_kernel`, - not intended to be used by users.""" + not intended to be used by users. + """ def __init__(self, handle, name, is_ndarray, dtypes): self.handle = handle self._name = name diff --git a/python/mxnet/runtime.py b/python/mxnet/runtime.py index 0f7de76937c0..f2e98fe674fa 100644 --- a/python/mxnet/runtime.py +++ b/python/mxnet/runtime.py @@ -26,9 +26,7 @@ from .base import _LIB, check_call class Feature(ctypes.Structure): - """ - Compile time feature description, member fields: `name` and `enabled`. - """ + """Compile time feature description, member fields: `name` and `enabled`.""" _fields_ = [ ("_name", ctypes.c_char_p), ("_enabled", ctypes.c_bool) @@ -36,16 +34,12 @@ class Feature(ctypes.Structure): @property def name(self): - """ - Feature name. - """ + """Feature name.""" return self._name.decode() @property def enabled(self): - """ - True if MXNet was compiled with the given compile-time feature. - """ + """True if MXNet was compiled with the given compile-time feature.""" return self._enabled def __repr__(self): @@ -55,8 +49,7 @@ def __repr__(self): return "✖ {}".format(self.name) def feature_list(): - """ - Check the library for compile-time features. The list of features are maintained in libinfo.h and libinfo.cc + """Check the library for compile-time features. The list of features are maintained in libinfo.h and libinfo.cc Returns ------- @@ -70,9 +63,7 @@ def feature_list(): return features class Features(collections.OrderedDict): - """ - OrderedDict of name to Feature - """ + """OrderedDict of name to Feature""" instance = None def __new__(cls): if cls.instance is None: @@ -84,8 +75,7 @@ def __repr__(self): return str(list(self.values())) def is_enabled(self, feature_name): - """ - Check for a particular feature by name + """Check for a particular feature by name Parameters ---------- diff --git a/python/mxnet/test_utils.py b/python/mxnet/test_utils.py index 4862aee8570d..6c8fefca4490 100644 --- a/python/mxnet/test_utils.py +++ b/python/mxnet/test_utils.py @@ -1935,8 +1935,7 @@ def same_array(array1, array2): @contextmanager def discard_stderr(): - """ - Discards error output of a routine if invoked as: + """Discards error output of a routine if invoked as: with discard_stderr(): ... @@ -2324,7 +2323,8 @@ def __exit__(self, ptype, value, trace): def collapse_sum_like(a, shape): """Given `a` as a numpy ndarray, perform reduce_sum on `a` over the axes that do not - exist in `shape`. Note that an ndarray with `shape` must be broadcastable to `a`.""" + exist in `shape`. Note that an ndarray with `shape` must be broadcastable to `a`. + """ assert len(a.shape) >= len(shape) if np.prod(shape) == 0 or a.size == 0: return np.zeros(shape, dtype=a.dtype) @@ -2349,7 +2349,8 @@ def is_cd_run(): def has_tvm_ops(): """Returns True if MXNet is compiled with TVM generated operators. 
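`collapse_sum_like`, whose docstring is reflowed above, sums an array over the axes along which it would have been broadcast from `shape`. A minimal NumPy sketch of those semantics, omitting the zero-size fast path the real helper includes:

```python
import numpy as np

def collapse_sum_like(a, shape):
    """Reduce-sum `a` over the axes that do not exist in `shape`.

    `shape` must be broadcastable to `a.shape`; a sketch of the semantics only.
    """
    assert len(a.shape) >= len(shape)
    # Sum away the leading axes that `shape` does not have at all.
    extra = len(a.shape) - len(shape)
    out = a.sum(axis=tuple(range(extra)))
    # Then sum (keeping dims) over axes where `shape` has size 1.
    for ax, s in enumerate(shape):
        if s == 1 and out.shape[ax] != 1:
            out = out.sum(axis=ax, keepdims=True)
    return out.reshape(shape)

grad = np.ones((2, 3, 4))
print(collapse_sum_like(grad, (3, 1)).shape)  # (3, 1)
```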
If current ctx - is GPU, it only returns True for CUDA compute capability > 52 where FP16 is supported.""" + is GPU, it only returns True for CUDA compute capability > 52 where FP16 is supported. + """ built_with_tvm_op = _features.is_enabled("TVM_OP") ctx = current_context() if ctx.device_type == 'gpu': @@ -2367,7 +2368,8 @@ def has_tvm_ops(): def is_op_runnable(): """Returns True for all CPU tests. Returns True for GPU tests that are either of the following. 1. Built with USE_TVM_OP=0. - 2. Built with USE_TVM_OP=1, but with compute capability >= 53.""" + 2. Built with USE_TVM_OP=1, but with compute capability >= 53. + """ ctx = current_context() if ctx.device_type == 'gpu': if not _features.is_enabled("TVM_OP"): diff --git a/python/mxnet/util.py b/python/mxnet/util.py index cef034fd0caa..9e15caae9698 100644 --- a/python/mxnet/util.py +++ b/python/mxnet/util.py @@ -60,8 +60,7 @@ def get_gpu_memory(gpu_dev_id): def set_np_shape(active): - """ - Turns on/off NumPy shape semantics, in which `()` represents the shape of scalar tensors, + """Turns on/off NumPy shape semantics, in which `()` represents the shape of scalar tensors, and tuples with `0` elements, for example, `(0,)`, `(1, 0, 2)`, represent the shapes of zero-size tensors. This is turned off by default for keeping backward compatibility. @@ -568,8 +567,7 @@ def hybrid_forward(self, F, x, w): def np_ufunc_legal_option(key, value): - """ - Checking if ufunc arguments are legal inputs + """Checking if ufunc arguments are legal inputs Parameters ---------- diff --git a/scala-package/native/src/main/native/org_apache_mxnet_native_c_api.cc b/scala-package/native/src/main/native/org_apache_mxnet_native_c_api.cc index 5c704c9646a2..26eea3dd062b 100644 --- a/scala-package/native/src/main/native/org_apache_mxnet_native_c_api.cc +++ b/scala-package/native/src/main/native/org_apache_mxnet_native_c_api.cc @@ -2777,9 +2777,9 @@ JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxDumpProfile // Numpy JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxIsNumpyShape (JNIEnv *env, jobject obj, jobject compatibleRef) { - bool isNumpyShape; + int isNumpyShape; int ret = MXIsNumpyShape(&isNumpyShape); - SetIntField(env, compatibleRef, static_cast(isNumpyShape)); + SetIntField(env, compatibleRef, isNumpyShape); return ret; } diff --git a/src/c_api/c_api_ndarray.cc b/src/c_api/c_api_ndarray.cc index b80e17c18071..de208c0fed99 100644 --- a/src/c_api/c_api_ndarray.cc +++ b/src/c_api/c_api_ndarray.cc @@ -276,7 +276,7 @@ int MXAutogradSetIsRecording(int is_recording, int* prev) { API_END(); } -int MXIsNumpyShape(bool* curr) { +int MXIsNumpyShape(int* curr) { API_BEGIN(); *curr = Imperative::Get()->is_np_shape(); API_END(); diff --git a/src/executor/graph_executor.cc b/src/executor/graph_executor.cc index d92253266f35..882105da1321 100644 --- a/src/executor/graph_executor.cc +++ b/src/executor/graph_executor.cc @@ -1627,16 +1627,16 @@ static nnvm::Graph InferForwardAttrs(nnvm::Graph g, static bool SubgraphBackendCheck(const op::SubgraphBackendPtr& backend, const Context& default_ctx, - bool verbose = false) { + int verbose = 1) { if (backend->HasAttr("enable") && (backend->GetAttr("enable") != true)) { - if (verbose) { + if (verbose > 1) { LOG(INFO) << "Subgraph backend " << backend->GetName() << " isn't activated."; } return false; } if (backend->HasAttr("context") && backend->GetAttr("context") != default_ctx) { - if (verbose) { + if (verbose > 1) { LOG(INFO) << "Subgraph backend " << backend->GetName() << " isn't activated as context mismatch."; } @@ 
-1647,7 +1647,7 @@ static bool SubgraphBackendCheck(const op::SubgraphBackendPtr& backend, static bool SubgraphPropertyCheck(const std::string& backend_name, const op::SubgraphPropertyPtr& prop, bool need_grad, - bool verbose = false) { + int verbose = 1) { auto full_name = prop->HasAttr("property_name") ? prop->GetAttr("property_name") : std::string(); if (prop->HasAttr("disable") && prop->GetAttr("disable") == true) { @@ -1657,7 +1657,7 @@ static bool SubgraphPropertyCheck(const std::string& backend_name, } if (prop->HasAttr("inference_only") && prop->GetAttr("inference_only") == true) { if (need_grad) { - if (verbose) { + if (verbose > 1) { LOG(INFO) << "skip partitioning graph with subgraph property " << full_name << " from backend " << backend_name << " as it requires `grad_req=null`."; } @@ -1699,7 +1699,7 @@ static nnvm::Symbol BuildSubgraph( const std::unordered_map& arg_stype_map, const Context& default_ctx, const std::map& ctx_map, std::vector* in_arg_ctxes, std::vector* arg_grad_ctxes, std::vector* grad_req_types, - std::vector* aux_state_ctxes, bool verbose = false) { + std::vector* aux_state_ctxes, int verbose = 1) { // setup map for in_arg_ctxes, arg_grad_ctxes, aux_state_ctxes and grad_req_types std::unordered_map in_arg_ctx_map; std::unordered_map arg_grad_ctx_map; @@ -1794,7 +1794,7 @@ static nnvm::Symbol BuildSubgraph(const nnvm::Symbol& src, const op::SubgraphBac std::vector* in_args, std::vector* arg_grad_store, std::vector* grad_req_type, - std::vector* aux_states, bool verbose = false) { + std::vector* aux_states, int verbose = 1) { // setup map for in_args, arg_grad_store, grad_req_type and aux_states std::unordered_map in_args_map; std::unordered_map arg_grad_store_map; @@ -1929,11 +1929,11 @@ Executor *Executor::SimpleBind(nnvm::Symbol symbol, auto exec = new exec::GraphExecutor(); bool init = false; if (!exec->subgraph_property().empty()) { - static bool verbose = dmlc::GetEnv("MXNET_SUBGRAPH_VERBOSE", false); + static int verbose = dmlc::GetEnv("MXNET_SUBGRAPH_VERBOSE", 1); const auto& backend_name = exec->subgraph_property(); const auto& backend = op::SubgraphBackendRegistry::Get()->GetSubgraphBackend(backend_name); if (exec::SubgraphBackendCheck(backend, default_ctx, verbose)) { - LOG(INFO) << "Subgraph backend " << backend_name << " is activated."; + if (verbose) LOG(INFO) << "Subgraph backend " << backend_name << " is activated."; std::vector tmp_in_arg_ctxes = in_arg_ctxes; std::vector tmp_arg_grad_ctxes = arg_grad_ctxes; std::vector tmp_aux_state_ctxes = aux_state_ctxes; @@ -2001,7 +2001,7 @@ Executor *Executor::Bind(nnvm::Symbol symbol, const std::vector &aux_states, Executor* shared_exec) { auto exec = new exec::GraphExecutor(); - static bool verbose = dmlc::GetEnv("MXNET_SUBGRAPH_VERBOSE", false); + static int verbose = dmlc::GetEnv("MXNET_SUBGRAPH_VERBOSE", 1); std::vector tmp_in_args = in_args; std::vector tmp_arg_grad_store = arg_grad_store; std::vector tmp_grad_req_type = grad_req_type; @@ -2011,7 +2011,7 @@ Executor *Executor::Bind(nnvm::Symbol symbol, const auto& backend_name = exec->subgraph_property(); const auto& backend = op::SubgraphBackendRegistry::Get()->GetSubgraphBackend(backend_name); if (exec::SubgraphBackendCheck(backend, default_ctx, verbose)) { - LOG(INFO) << "Subgraph backend " << backend_name << " is activated."; + if (verbose) LOG(INFO) << "Subgraph backend " << backend_name << " is activated."; symbol = exec::BuildSubgraph(symbol, backend, default_ctx, group2ctx, &tmp_in_args, &tmp_arg_grad_store, &tmp_grad_req_type, 
&tmp_aux_states, verbose); diff --git a/src/imperative/cached_op.cc b/src/imperative/cached_op.cc index 6818d757ab79..39c2880d627b 100644 --- a/src/imperative/cached_op.cc +++ b/src/imperative/cached_op.cc @@ -32,6 +32,22 @@ DMLC_REGISTER_PARAMETER(CachedOpConfig); constexpr uint32_t kEidNotExist = std::numeric_limits::max(); +const char CachedOp::FULL[] = "full"; +const char CachedOp::FORWARD[] = "forward"; +const char CachedOp::BACKWARD[] = "backward"; +const char CachedOp::REF_COUNT[] = "ref_count"; +const char CachedOp::MEM_PLAN[] = "mem_plan"; +const char CachedOp::STORAGE_PLAN[] = "storage_plan"; + +namespace { + +std::string AddPrefix(const std::string& prefix, + const std::string& s) { + return prefix + "_" + s; +} + +} // namespace + struct CachedOp::GraphInfo { nnvm::Graph fwd_graph; nnvm::Graph full_graph; @@ -136,7 +152,7 @@ CachedOp::CachedOp( for (const auto& j : idx[i].inputs) ++ref_count[idx.entry_id(j)]; } - fwd_graph_.attrs["forward_ref_count"] = + fwd_graph_.attrs[AddPrefix(FORWARD, REF_COUNT)] = std::make_shared(std::move(ref_count)); inlining_ = !config_.static_alloc && @@ -201,9 +217,9 @@ CachedOp::CachedOp( } } - auto full_ref_count = fwd_graph_.GetAttr >("forward_ref_count"); + auto full_ref_count = fwd_graph_.GetAttr >(AddPrefix(FORWARD, REF_COUNT)); for (size_t i = 0; i < num_forward_entries; ++i) full_ref_count.at(i) += ref_count[i]; - fwd_graph_.attrs["full_ref_count"] = + fwd_graph_.attrs[AddPrefix(FULL, REF_COUNT)] = std::make_shared(std::move(full_ref_count)); size_t num_forward_inputs = num_inputs(); @@ -336,14 +352,15 @@ bool CachedOp::SetForwardGraph( // When dynmaic shape exists, it is not feasible to plan memory ahead of time if (contain_dynamic_shape) { - g.attrs.erase("forward_mem_plan"); - g.attrs.erase("full_mem_plan"); + g.attrs.erase(AddPrefix(FORWARD, MEM_PLAN)); + g.attrs.erase(AddPrefix(FULL, MEM_PLAN)); return false; } + const std::string& prefix = recording ? FULL : FORWARD; if (!match) { - g.attrs.erase("forward_mem_plan"); - g.attrs.erase("full_mem_plan"); - } else if (g.attrs.count(recording ? "full_mem_plan" : "forward_mem_plan")) { + g.attrs.erase(AddPrefix(FORWARD, MEM_PLAN)); + g.attrs.erase(AddPrefix(FULL, MEM_PLAN)); + } else if (g.attrs.count(AddPrefix(prefix, MEM_PLAN))) { return true; } @@ -363,9 +380,9 @@ bool CachedOp::SetForwardGraph( } auto mem_plan = PlanMemory( - &g, std::move(storage), g.GetAttr >( - recording ? "full_ref_count" : "forward_ref_count")); - g.attrs[recording ? 
"full_mem_plan" : "forward_mem_plan"] = + &g, std::move(storage), g.GetAttr >(AddPrefix(prefix, REF_COUNT)), + AddPrefix(prefix, STORAGE_PLAN)); + g.attrs[AddPrefix(prefix, MEM_PLAN)] = std::make_shared(std::move(mem_plan)); return false; @@ -432,7 +449,7 @@ bool CachedOp::SetBackwardGraph( size_t num_forward_nodes = fwd_graph_.indexed_graph().num_nodes(); size_t num_forward_entries = fwd_graph_.indexed_graph().num_node_entries(); - if (!g.attrs.count("backward_ref_count")) { + if (!g.attrs.count(AddPrefix(BACKWARD, REF_COUNT))) { std::vector ref_count(idx.num_node_entries(), 0); for (size_t i = num_forward_nodes; i < idx.num_nodes(); ++i) { for (const auto& j : idx[i].inputs) ++ref_count[idx.entry_id(j)]; @@ -443,7 +460,7 @@ bool CachedOp::SetBackwardGraph( } } for (const auto& i : idx.outputs()) ++ref_count[idx.entry_id(i)]; - g.attrs["backward_ref_count"] = std::make_shared(std::move(ref_count)); + g.attrs[AddPrefix(BACKWARD, REF_COUNT)] = std::make_shared(std::move(ref_count)); } auto shapes = info->fwd_graph.GetAttr("shape"); @@ -476,8 +493,8 @@ bool CachedOp::SetBackwardGraph( false, node_range, entry_range); if (!match) { - g.attrs.erase("backward_mem_plan"); - } else if (g.attrs.count("backward_mem_plan")) { + g.attrs.erase(AddPrefix(BACKWARD, MEM_PLAN)); + } else if (g.attrs.count(AddPrefix(BACKWARD, MEM_PLAN))) { return true; } @@ -491,11 +508,13 @@ bool CachedOp::SetBackwardGraph( for (const auto i : idx.outputs()) storage[idx.entry_id(i)] = exec::kExternalStorageID; auto mem_plan = PlanMemory( - &g, std::move(storage), g.GetAttr >("backward_ref_count"), + &g, std::move(storage), + g.GetAttr >(AddPrefix(BACKWARD, REF_COUNT)), + AddPrefix(BACKWARD, STORAGE_PLAN), {num_forward_nodes, idx.num_nodes()}, {num_forward_entries, idx.num_node_entries()}, detect_inplace_addto); - g.attrs["backward_mem_plan"] = std::make_shared(std::move(mem_plan)); + g.attrs[AddPrefix(BACKWARD, MEM_PLAN)] = std::make_shared(std::move(mem_plan)); return false; } @@ -526,9 +545,10 @@ void CachedOp::StaticAllocMemory( const auto& default_ctx = state.context; nnvm::Graph& g = keep_fwd ? state.info.full_graph : state.info.fwd_graph; const auto& idx = g.indexed_graph(); - const auto& vstorage_inplace = g.GetAttr >("storage_inplace_index"); - const auto& mem_plan = g.GetAttr( - keep_fwd ? "backward_mem_plan" : (recording ? "full_mem_plan" : "forward_mem_plan")); + const std::string& graph_type = keep_fwd ? BACKWARD : (recording ? FULL : FORWARD); + const auto& storage_plan_attr = AddPrefix(graph_type, STORAGE_PLAN); + const auto& storage_plan = g.GetAttr >(storage_plan_attr); + const auto& mem_plan = g.GetAttr(AddPrefix(graph_type, MEM_PLAN)); std::vector addto_entry; if (g.attrs.count("addto_entry")) { addto_entry = g.GetAttr >("addto_entry"); @@ -558,9 +578,9 @@ void CachedOp::StaticAllocMemory( for (size_t i = start_eid; i < end_eid; ++i) { if (addto_entry.size() && addto_entry[i]) { state.array_reqs[i] = kAddTo; - } else if (vstorage_inplace[i] >= 0) { + } else if (storage_plan[i] >= 0) { state.array_reqs[i] = kWriteInplace; - } else if (vstorage_inplace[i] == -2) { + } else if (storage_plan[i] == -2) { // -2 indicate that the entry is never referenced. state.array_reqs[i] = kNullOp; } else { @@ -862,8 +882,9 @@ OpStatePtr CachedOp::DynamicForward( } // Allocate NDArrays - std::vector ref_count = g.GetAttr >( - recording ? "full_ref_count" : "forward_ref_count"); + const std::string& graph_type = recording ? 
FULL : FORWARD; + std::vector ref_count = + g.GetAttr >(AddPrefix(graph_type, REF_COUNT)); std::vector array_reqs(arrays.size(), kWriteTo); for (size_t i = 0; i < idx.num_node_entries(); ++i) { @@ -871,8 +892,7 @@ OpStatePtr CachedOp::DynamicForward( } const auto& dispatch_modes = g.GetAttr("dispatch_mode"); if (!use_naive_run) { - const auto& mem_plan = g.GetAttr( - recording ? "full_mem_plan" : "forward_mem_plan"); + const auto& mem_plan = g.GetAttr(AddPrefix(graph_type, MEM_PLAN)); AllocateMemory(g, idx, default_ctx, 0, idx.num_node_entries(), mem_plan, arrays, &array_reqs); const auto& dtypes = g.GetAttr("dtype"); @@ -1011,7 +1031,7 @@ void CachedOp::DynamicBackward( } // Allocate NDArrays - auto ref_count = g.GetAttr >("backward_ref_count"); + auto ref_count = g.GetAttr >(AddPrefix(BACKWARD, REF_COUNT)); if (retain_graph) { for (size_t i = 0; i < num_forward_entries; ++i) ++ref_count[i]; } @@ -1027,7 +1047,7 @@ void CachedOp::DynamicBackward( if (ref_count[i] == 0) array_reqs[i] = kNullOp; } - const auto& mem_plan = g.GetAttr("backward_mem_plan"); + const auto& mem_plan = g.GetAttr(AddPrefix(BACKWARD, MEM_PLAN)); AllocateMemory(g, idx, default_ctx, num_forward_entries, idx.num_node_entries(), mem_plan, arrays, &array_reqs); diff --git a/src/imperative/cached_op.h b/src/imperative/cached_op.h index db049d59ed80..84f96300c27b 100644 --- a/src/imperative/cached_op.h +++ b/src/imperative/cached_op.h @@ -140,6 +140,13 @@ class CachedOp { void RegisterOpHook(const CachedOp::CachedOpMonCallback& callback, bool monitor_all = false); + static const char FULL[]; + static const char FORWARD[]; + static const char BACKWARD[]; + static const char REF_COUNT[]; + static const char MEM_PLAN[]; + static const char STORAGE_PLAN[]; + private: struct GraphInfo; struct DynamicRuntime; diff --git a/src/imperative/imperative_utils.h b/src/imperative/imperative_utils.h index 356b85e67ee2..64034dafb4d5 100644 --- a/src/imperative/imperative_utils.h +++ b/src/imperative/imperative_utils.h @@ -834,6 +834,7 @@ inline MemoryPlanVector PlanMemory( nnvm::Graph* p_g, nnvm::StorageVector&& storage, const std::vector& ref_count, + const std::string& storage_plan, const std::pair& node_range = {0, 0}, const std::pair& entry_range = {0, 0}, bool detect_inplace_addto = false) { @@ -851,6 +852,7 @@ inline MemoryPlanVector PlanMemory( const auto& dtypes = g.GetAttr("dtype"); const auto& shapes = g.GetAttr("shape"); const auto& storage_inplace = g.GetAttr >("storage_inplace_index"); + g.attrs[storage_plan] = std::make_shared(storage_inplace); const auto& storage_ids = g.GetAttr("storage_id"); uint32_t entry_start = entry_range.first; uint32_t entry_end = diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc index e1374ecdb9dd..3feccf55b734 100644 --- a/src/ndarray/ndarray.cc +++ b/src/ndarray/ndarray.cc @@ -1616,12 +1616,13 @@ void NDArray::Save(dmlc::Stream *strm) const { nd_cpu.WaitToRead(); save_data = nd_cpu.data(); } else { +#if MXNET_USE_MKLDNN == 1 + // For mkldnn, a copy of *this can ensure no write access pending on *this. 
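The hard-coded attribute strings that `CachedOp` previously used ("forward_ref_count", "full_mem_plan", and so on) are now composed by the `AddPrefix` helper from the graph-type and attribute-name constants declared in cached_op.h. A tiny Python sketch of the resulting key scheme, purely illustrative:

```python
# Mirrors cached_op.cc's AddPrefix(prefix, s) -> prefix + "_" + s.
GRAPH_TYPES = ("full", "forward", "backward")   # CachedOp::FULL/FORWARD/BACKWARD
ATTRS = ("ref_count", "mem_plan", "storage_plan")  # REF_COUNT/MEM_PLAN/STORAGE_PLAN

def add_prefix(prefix, s):
    return prefix + "_" + s

# Every graph attribute the CachedOp touches is one of these nine keys.
keys = [add_prefix(g, a) for g in GRAPH_TYPES for a in ATTRS]
print(keys[:3])  # ['full_ref_count', 'full_mem_plan', 'full_storage_plan']
```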
+ nd_cpu = this->Copy(Context::CPU()); + nd_cpu.WaitToRead(); +#else this->WaitToRead(); nd_cpu = *this; -#if MXNET_USE_MKLDNN == 1 - if (nd_cpu.IsMKLDNNData()) { - nd_cpu = nd_cpu.Reorder2Default(); - } #endif save_data = nd_cpu.data(); } @@ -1714,7 +1715,8 @@ bool NDArray::Load(dmlc::Stream *strm) { " Please turn on np shape semantics in Python using `with np_shape(True)`" " or decorator `use_np_shape` to scope the code of loading the ndarray."; } else { - CHECK(!Imperative::Get()->is_np_shape()) + // when the flag is global on, skip the check since it would be always global on. + CHECK(Imperative::Get()->is_np_shape() == GlobalOn || !Imperative::Get()->is_np_shape()) << "ndarray was not saved in np shape semantics, but being loaded in np shape semantics." " Please turn off np shape semantics in Python using `with np_shape(False)`" " to scope the code of loading the ndarray."; @@ -2005,16 +2007,18 @@ void NDArray::SyncCopyToCPU(void *data, size_t size) const { TBlob dst(data, dshape, cpu::kDevMask, this->dtype_, 0); // NOLINT(*) if (this->ctx().dev_mask() == cpu::kDevMask) { - this->WaitToRead(); - RunContext rctx{this->ctx(), nullptr, nullptr, false}; - NDArray src = *this; + Engine::Get()->PushAsync( + [&](RunContext rctx, Engine::CallbackOnComplete on_complete) { + RunContext ctx{this->ctx(), nullptr, nullptr, false}; + NDArray src = *this; #if MXNET_USE_MKLDNN == 1 - if (src.IsMKLDNNData()) { - src = this->Reorder2Default(); - } + src = this->Reorder2Default(); #endif - ndarray::Copy(src.data(), &dst, - Context::CPU(), Context::CPU(), rctx); + ndarray::Copy(src.data(), &dst, Context::CPU(), Context::CPU(), ctx); + on_complete(); + }, + this->ctx(), {this->var()}, {}, FnProperty::kNormal, 0, "SyncCopyCPU2CPU"); + this->WaitToWrite(); } else { #if MXNET_USE_CUDA Engine::Get()->PushAsync( diff --git a/src/operator/quantization/dequantize.cc b/src/operator/quantization/dequantize.cc index e8e2cd90b86c..9ce135040fb4 100644 --- a/src/operator/quantization/dequantize.cc +++ b/src/operator/quantization/dequantize.cc @@ -43,8 +43,6 @@ bool DequantizeStorageType(const nnvm::NodeAttrs& attrs, } #endif (*out_attrs)[0] = kDefaultStorage; - (*out_attrs)[1] = kDefaultStorage; - (*out_attrs)[2] = kDefaultStorage; return true; } diff --git a/src/operator/subgraph/build_subgraph.cc b/src/operator/subgraph/build_subgraph.cc index d43647ac83b9..0f4c570331a2 100644 --- a/src/operator/subgraph/build_subgraph.cc +++ b/src/operator/subgraph/build_subgraph.cc @@ -318,8 +318,8 @@ void PreSelectSubgraphNodes(const nnvm::Graph& g, SubgraphSelectorV2Ptr subgraph for (auto node : excluded_nodes) { excluded_node_names += node->node->attrs.name + ", "; } - static bool verbose = dmlc::GetEnv("MXNET_SUBGRAPH_VERBOSE", false); - if (verbose) { + static int verbose = dmlc::GetEnv("MXNET_SUBGRAPH_VERBOSE", 1); + if (verbose > 1) { LOG(INFO) << "Found a cycle when BFS from node " << simple_nodes[snid]->node->attrs.name << ". Excluding nodes " << excluded_node_names << "and retrying"; } @@ -706,9 +706,9 @@ void TopSortEntries(const nnvm::Graph& g, } nnvm::Graph BuildSubgraph(nnvm::Graph&& g) { - static bool verbose = dmlc::GetEnv("MXNET_SUBGRAPH_VERBOSE", false); + static int verbose = dmlc::GetEnv("MXNET_SUBGRAPH_VERBOSE", 1); if (!g.HasAttr("subgraph_property")) { // treat the whole graph as a subgraph - if (verbose) { + if (verbose > 1) { LOG(INFO) << "The graph has no attribute of subgraph_property attached. 
" "The original graph is returned."; } diff --git a/src/operator/tensor/dot-inl.h b/src/operator/tensor/dot-inl.h index 96c869f40d40..8405404dc627 100644 --- a/src/operator/tensor/dot-inl.h +++ b/src/operator/tensor/dot-inl.h @@ -30,6 +30,7 @@ #include #include #include + #include "./util/tensor_util-inl.h" #include "../mshadow_op.h" #include "../elemwise_op_common.h" @@ -1353,6 +1354,7 @@ void BatchDotForward_(const nnvm::NodeAttrs& attrs, using namespace mshadow; using namespace mshadow::expr; mshadow::Stream *s = ctx.get_stream(); + if (req[0] == kNullOp) return; const DotParam& param = nnvm::get(attrs.parsed); CHECK_EQ(outputs[0].type_flag_, inputs[0].type_flag_) << "Binary function only support input/output with the same type"; @@ -1362,115 +1364,46 @@ void BatchDotForward_(const nnvm::NodeAttrs& attrs, (outputs[0].type_flag_ == kFloat16 && ctx.run_ctx.ctx.dev_mask() == mshadow::gpu::kDevMask)) << "dot only supports float32/float64 for CPU, and float16/float32/float64 for GPU"; MSHADOW_REAL_TYPE_SWITCH(outputs[0].type_flag_, DType, { - mshadow::Tensor out = outputs[0].get(s); - mshadow::Tensor mlhs = inputs[0].get(s); - mshadow::Tensor mrhs = inputs[1].get(s); - mshadow::Tensor workspace = - ctx.requested[0].get_space_typed(mshadow::Shape1(3 * out.size(0)), s); - if (kNullOp != req[0]) { - if (param.transpose_a && param.transpose_b) { - mshadow::BatchGEMM(out, mlhs, mrhs, (DType)1.0f, - (kAddTo == req[0]) ? (DType)1.0f : (DType)0.0f, - workspace); - } else if (!param.transpose_a && param.transpose_b) { - mshadow::BatchGEMM(out, mlhs, mrhs, (DType)1.0f, - (kAddTo == req[0]) ? (DType)1.0f : (DType)0.0f, - workspace); - } else if (param.transpose_a && !param.transpose_b) { - mshadow::BatchGEMM(out, mlhs, mrhs, (DType)1.0f, - (kAddTo == req[0]) ? (DType)1.0f : (DType)0.0f, - workspace); - } else { - mshadow::BatchGEMM(out, mlhs, mrhs, (DType)1.0f, - (kAddTo == req[0]) ? 
(DType)1.0f : (DType)0.0f, - workspace); + int ndim = outputs[0].ndim(); + if (outputs[0].shape_.Size() == 0 || inputs[0].shape_.Size() == 0 + || inputs[1].shape_.Size() == 0) { + if (outputs[0].shape_.Size() != 0 && req[0] != kAddTo) { + mxnet_op::Kernel::Launch(s, outputs[0].shape_.Size(), + outputs[0].dptr()); } + return; } - }); -} - -template -void BatchDotBackward_(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - using namespace mshadow; - using namespace mshadow::expr; - mshadow::Stream *s = ctx.get_stream(); - const DotParam& param = nnvm::get(attrs.parsed); - CHECK_NE(req[1], kWriteInplace); - CHECK_NE(req[0], kWriteInplace); - CHECK(outputs[0].type_flag_ == kFloat32 || outputs[0].type_flag_ == kFloat64 || - (outputs[0].type_flag_ == kFloat16 && ctx.run_ctx.ctx.dev_mask() == mshadow::gpu::kDevMask)) - << "dot only supports float32/float64 for CPU, and float16/float32/float64 for GPU"; - MSHADOW_REAL_TYPE_SWITCH(outputs[0].type_flag_, DType, { - mshadow::Tensor mout_grad = inputs[0].get(s); - mshadow::Tensor mlhs_data = inputs[1].get(s); - mshadow::Tensor mrhs_data = inputs[2].get(s); - mshadow::Tensor mlhs_grad = outputs[0].get(s); - mshadow::Tensor mrhs_grad = outputs[1].get(s); - mshadow::Tensor workspace = - ctx.requested[0].get_space_typed( - mshadow::Shape2(2, 3 * mout_grad.size(0)), s); - mshadow::Tensor rhs_workspace = workspace[0]; - mshadow::Tensor lhs_workspace = workspace[1]; + size_t batch_size = outputs[0].shape_.ProdShape(0, ndim - 2); + mshadow::Tensor out = + outputs[0].get_with_shape(Shape3(batch_size, + outputs[0].shape_[ndim - 2], + outputs[0].shape_[ndim - 1]), s); + mshadow::Tensor mlhs = + inputs[0].get_with_shape(Shape3(batch_size, + inputs[0].shape_[ndim - 2], + inputs[0].shape_[ndim - 1]), s); + mshadow::Tensor mrhs = + inputs[1].get_with_shape(Shape3(batch_size, + inputs[1].shape_[ndim - 2], + inputs[1].shape_[ndim - 1]), s); + mshadow::Tensor workspace = + ctx.requested[0].get_space_typed(mshadow::Shape1(3 * out.size(0)), s); if (param.transpose_a && param.transpose_b) { - // Gradient of z = dot(x.T, y.T) - // dy = dot(x, dz).T = dot(dz.T, x.T) - // dx = dot(dz, y).T = dot(y.T, dz.T) - if (kNullOp != req[1]) { - mshadow::BatchGEMM(mrhs_grad, mout_grad, mlhs_data, (DType)1.0f, - (kAddTo == req[1]) ? (DType)1.0f : (DType)0.0f, - rhs_workspace); - } - if (kNullOp != req[0]) { - mshadow::BatchGEMM(mlhs_grad, mrhs_data, mout_grad, (DType)1.0f, - (kAddTo == req[0]) ? (DType)1.0f : (DType)0.0f, - lhs_workspace); - } + mshadow::BatchGEMM(out, mlhs, mrhs, (DType)1.0f, + (kAddTo == req[0]) ? (DType)1.0f : (DType)0.0f, + workspace); } else if (!param.transpose_a && param.transpose_b) { - // Gradient of z = dot(x, y.T) - // dy = dot(x.T, dz).T = dot(dz.T, x) - // dx = dot(dz, y) - if (kNullOp != req[1]) { - mshadow::BatchGEMM(mrhs_grad, mout_grad, mlhs_data, (DType)1.0f, - (kAddTo == req[1]) ? (DType)1.0f : (DType)0.0f, - rhs_workspace); - } - if (kNullOp != req[0]) { - mshadow::BatchGEMM(mlhs_grad, mout_grad, mrhs_data, (DType)1.0f, - (kAddTo == req[0]) ? (DType)1.0f : (DType)0.0f, - lhs_workspace); - } + mshadow::BatchGEMM(out, mlhs, mrhs, (DType)1.0f, + (kAddTo == req[0]) ? 
(DType)1.0f : (DType)0.0f, + workspace); } else if (param.transpose_a && !param.transpose_b) { - // Gradient of z = dot(x.T, y) - // dy = dot(x, dz) - // dx = dot(dz, y.T).T = dot(y, dz.T) - if (kNullOp != req[1]) { - mshadow::BatchGEMM(mrhs_grad, mlhs_data, mout_grad, (DType)1.0f, - (kAddTo == req[1]) ? (DType)1.0f : (DType)0.0f, - rhs_workspace); - } - if (kNullOp != req[0]) { - mshadow::BatchGEMM(mlhs_grad, mrhs_data, mout_grad, (DType)1.0f, - (kAddTo == req[0]) ? (DType)1.0f : (DType)0.0f, - lhs_workspace); - } + mshadow::BatchGEMM(out, mlhs, mrhs, (DType)1.0f, + (kAddTo == req[0]) ? (DType)1.0f : (DType)0.0f, + workspace); } else { - // Gradient of z = dot(x, y) - // dy = dot(x.T, dz) - // dx = dot(dz, y.T) - if (kNullOp != req[1]) { - mshadow::BatchGEMM(mrhs_grad, mlhs_data, mout_grad, (DType)1.0f, - (kAddTo == req[1]) ? (DType)1.0f : (DType)0.0f, - rhs_workspace); - } - if (kNullOp != req[0]) { - mshadow::BatchGEMM(mlhs_grad, mout_grad, mrhs_data, (DType)1.0f, - (kAddTo == req[0]) ? (DType)1.0f : (DType)0.0f, - lhs_workspace); - } + mshadow::BatchGEMM(out, mlhs, mrhs, (DType)1.0f, + (kAddTo == req[0]) ? (DType)1.0f : (DType)0.0f, + workspace); } }); } @@ -1485,24 +1418,34 @@ inline bool BatchDotShape(const nnvm::NodeAttrs& attrs, mxnet::TShape& rshape = (*in_attrs)[1]; // return false if lhs and rhs both have fully unknown shape if (!ndim_is_known(lshape) || !ndim_is_known(rshape)) return false; - if (lshape.ndim() == 3 && rshape.ndim() == 3) { + if (lshape.ndim() >= 3 && rshape.ndim() >= 3 && lshape.ndim() == rshape.ndim()) { + int ndim = lshape.ndim(); // only partially infer shape if last dim of lhs and second dim of rhs is known - bool last_dim_known = dim_size_is_known(lshape, 2); - bool second_dim_known = dim_size_is_known(rshape, 1); + bool last_dim_known = dim_size_is_known(lshape, ndim - 1); + bool second_dim_known = dim_size_is_known(rshape, ndim - 2); if ( !last_dim_known || !second_dim_known) return false; - CHECK(lshape[0] == rshape[0]) - << "batch_dot shape error(batch_size must be equal): " << lshape << " X " << rshape - << " trans_a=" << param.transpose_a << " trans_b=" << param.transpose_b; - index_t out_m = param.transpose_a ? lshape[2] : lshape[1]; - index_t lshape_k = param.transpose_a ? lshape[1] : lshape[2]; - index_t out_n = param.transpose_b ? rshape[1] : rshape[2]; - index_t rshape_k = param.transpose_b ? rshape[2] : rshape[1]; - CHECK(lshape_k == rshape_k) - << "batch_dot shape error(shape mismatch): " << lshape << " X " << rshape + for (int i = 0; i < ndim - 2; i++) { + CHECK_EQ(lshape[i], rshape[i]) + << "batch_dot shape error (the leading batch dimensions must be equal): " + << lshape << " X " << rshape + << " trans_a=" << param.transpose_a << " trans_b=" << param.transpose_b; + } + dim_t out_m = param.transpose_a ? lshape[ndim - 1] : lshape[ndim - 2]; + dim_t lshape_k = param.transpose_a ? lshape[ndim - 2] : lshape[ndim - 1]; + dim_t out_n = param.transpose_b ? rshape[ndim - 2] : rshape[ndim - 1]; + dim_t rshape_k = param.transpose_b ? 
rshape[ndim - 1] : rshape[ndim - 2]; + CHECK_EQ(lshape_k, rshape_k) + << "batch_dot shape error (shape mismatch): " << lshape << " X " << rshape << " trans_a=" << param.transpose_a << " trans_b=" << param.transpose_b; - SHAPE_ASSIGN_CHECK(*out_attrs, 0, mshadow::Shape3(lshape[0], out_m, out_n)); + std::vector out_shape_vec; + for (int i = 0; i < ndim - 2; i++) { + out_shape_vec.push_back(lshape[i]); + } + out_shape_vec.push_back(out_m); + out_shape_vec.push_back(out_n); + SHAPE_ASSIGN_CHECK(*out_attrs, 0, mxnet::TShape(out_shape_vec)); } else { - LOG(FATAL) << "batch_dot currently only support 3D*3D array" + LOG(FATAL) << "batch_dot currently only support N-D*N-D array (N >= 3)" << lshape << " v.s. " << rshape; } // return true if output shape is fully inferred diff --git a/src/operator/tensor/dot.cc b/src/operator/tensor/dot.cc index 11a056146e1d..556260ed9600 100644 --- a/src/operator/tensor/dot.cc +++ b/src/operator/tensor/dot.cc @@ -115,13 +115,13 @@ NNVM_REGISTER_OP(batch_dot) .describe(R"doc(Batchwise dot product. ``batch_dot`` is used to compute dot product of ``x`` and ``y`` when ``x`` and -``y`` are data in batch, namely 3D arrays in shape of `(batch_size, :, :)`. +``y`` are data in batch, namely N-D (N >= 3) arrays in shape of `(B0, ..., B_i, :, :)`. -For example, given ``x`` with shape `(batch_size, n, m)` and ``y`` with shape -`(batch_size, m, k)`, the result array will have shape `(batch_size, n, k)`, +For example, given ``x`` with shape `(B_0, ..., B_i, N, M)` and ``y`` with shape +`(B_0, ..., B_i, M, K)`, the result array will have shape `(B_0, ..., B_i, N, K)`, which is computed by:: - batch_dot(x,y)[i,:,:] = dot(x[i,:,:], y[i,:,:]) + batch_dot(x,y)[b_0, ..., b_i, :, :] = dot(x[b_0, ..., b_i, :, :], y[b_0, ..., b_i, :, :]) )doc" ADD_FILELINE) .set_num_inputs(2) @@ -138,21 +138,73 @@ which is computed by:: return std::vector{ResourceRequest::kTempSpace}; }) .set_attr("FCompute", BatchDotForward_) -.set_attr("FGradient", ElemwiseGradUseIn{"_backward_batch_dot"}) +.set_attr("FGradient", + [](const nnvm::NodePtr& n, + const std::vector& ograds) { + const DotParam& param = nnvm::get(n->attrs.parsed); + nnvm::NodePtr lhs_grad; + nnvm::NodePtr rhs_grad; + std::string lhs_gnode_name = n->attrs.name + "_backward_lhs"; + std::string rhs_gnode_name = n->attrs.name + "_backward_rhs"; + if (param.transpose_a && param.transpose_b) { + // Gradient of z = dot(x.T, y.T) + // dx = dot(dz, y).T = dot(y.T, dz.T) + // dy = dot(x, dz).T = dot(dz.T, x.T) + lhs_grad = MakeNode("batch_dot", lhs_gnode_name, + {n->inputs[1], ograds[0]}, &(n->attrs.dict), &n); + rhs_grad = MakeNode("batch_dot", rhs_gnode_name, + {ograds[0], n->inputs[0]}, &(n->attrs.dict), &n); + } else if (!param.transpose_a && param.transpose_b) { + // Gradient of z = dot(x, y.T) + // dx = dot(dz, y) + // dy = dot(x.T, dz).T = dot(dz.T, x) + auto lhs_attrs_dict = n->attrs.dict; + auto rhs_attrs_dict = n->attrs.dict; + lhs_attrs_dict["transpose_a"] = "false"; + lhs_attrs_dict["transpose_b"] = "false"; + rhs_attrs_dict["transpose_a"] = "true"; + rhs_attrs_dict["transpose_b"] = "false"; + lhs_grad = MakeNode("batch_dot", lhs_gnode_name, + {ograds[0], n->inputs[1]}, &lhs_attrs_dict, &n); + rhs_grad = MakeNode("batch_dot", rhs_gnode_name, + {ograds[0], n->inputs[0]}, &rhs_attrs_dict, &n); + } else if (param.transpose_a && !param.transpose_b) { + // Gradient of z = dot(x.T, y) + // dx = dot(dz, y.T).T = dot(y, dz.T) + // dy = dot(x, dz) + auto lhs_attrs_dict = n->attrs.dict; + auto rhs_attrs_dict = n->attrs.dict; + 
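The new `FGradient` lambda above expresses both input gradients of `batch_dot` as further `batch_dot` nodes, using the transpose identities spelled out in its comments. A NumPy spot check of the identities for the default case `transpose_a=False, transpose_b=False`, where `dz` denotes the output gradient:

```python
import numpy as np

rng = np.random.default_rng(0)
x = rng.standard_normal((2, 3, 4))   # (B, N, M)
y = rng.standard_normal((2, 4, 5))   # (B, M, K)
dz = rng.standard_normal((2, 3, 5))  # gradient w.r.t. z = batch_dot(x, y)

# Gradient of z = dot(x, y):  dx = dot(dz, y.T),  dy = dot(x.T, dz)
dx = np.matmul(dz, y.swapaxes(-1, -2))
dy = np.matmul(x.swapaxes(-1, -2), dz)
assert dx.shape == x.shape and dy.shape == y.shape
```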
lhs_attrs_dict["transpose_a"] = "false"; + lhs_attrs_dict["transpose_b"] = "true"; + rhs_attrs_dict["transpose_a"] = "false"; + rhs_attrs_dict["transpose_b"] = "false"; + lhs_grad = MakeNode("batch_dot", lhs_gnode_name, + {n->inputs[1], ograds[0]}, &lhs_attrs_dict, &n); + rhs_grad = MakeNode("batch_dot", rhs_gnode_name, + {n->inputs[0], ograds[0]}, &rhs_attrs_dict, &n); + } else { + // Gradient of z = dot(x, y) + // dx = dot(dz, y.T) + // dy = dot(x.T, dz) + auto lhs_attrs_dict = n->attrs.dict; + auto rhs_attrs_dict = n->attrs.dict; + lhs_attrs_dict["transpose_a"] = "false"; + lhs_attrs_dict["transpose_b"] = "true"; + rhs_attrs_dict["transpose_a"] = "true"; + rhs_attrs_dict["transpose_b"] = "false"; + lhs_grad = MakeNode("batch_dot", lhs_gnode_name, + {ograds[0], n->inputs[1]}, &lhs_attrs_dict, &n); + rhs_grad = MakeNode("batch_dot", rhs_gnode_name, + {n->inputs[0], ograds[0]}, &rhs_attrs_dict, &n); + } + std::vector ret; + ret.emplace_back(nnvm::NodeEntry{lhs_grad, 0, 0}); + ret.emplace_back(nnvm::NodeEntry{rhs_grad, 0, 0}); + return ret; +}) .add_argument("lhs", "NDArray-or-Symbol", "The first input") .add_argument("rhs", "NDArray-or-Symbol", "The second input") .add_arguments(DotParam::__FIELDS__()); -NNVM_REGISTER_OP(_backward_batch_dot) -.set_num_inputs(3) -.set_num_outputs(2) -.set_attr_parser(ParamParser) -.set_attr("FResourceRequest", - [](const NodeAttrs& attrs) { - return std::vector{ResourceRequest::kTempSpace}; - }) -.set_attr("TIsBackward", true) -.set_attr("FCompute", BatchDotBackward_); - } // namespace op } // namespace mxnet diff --git a/src/operator/tensor/dot.cu b/src/operator/tensor/dot.cu index 8ee2e2832fbb..b245b1c9e5ed 100644 --- a/src/operator/tensor/dot.cu +++ b/src/operator/tensor/dot.cu @@ -38,8 +38,5 @@ NNVM_REGISTER_OP(_backward_dot) NNVM_REGISTER_OP(batch_dot) .set_attr("FCompute", BatchDotForward_); -NNVM_REGISTER_OP(_backward_batch_dot) -.set_attr("FCompute", BatchDotBackward_); - } // namespace op } // namespace mxnet diff --git a/tests/nightly/test_large_array.py b/tests/nightly/test_large_array.py index 0cb21cedee35..c18a95400f22 100644 --- a/tests/nightly/test_large_array.py +++ b/tests/nightly/test_large_array.py @@ -1415,10 +1415,10 @@ def check_arcsinh(): assert_correctness_of_trigonometric_ops(y, expected_output) def check_arccosh(): - x = create_input_for_trigonometric_ops([1, np.pi/2, 3*np.pi/4, np.pi]) + x = create_input_for_trigonometric_ops([1, np.pi/2, 3*np.pi/4, np.pi, 5*np.pi/4]) y = nd.arccosh(x) # expected ouput for indices=(0, 1, -3, -2, -1) after applying arccosh() - expected_output = [0, np.arccosh(np.pi/2), np.arccosh(3*np.pi/4), np.arccosh(np.pi)] + expected_output = [0, np.arccosh(np.pi/2), np.arccosh(3*np.pi/4), np.arccosh(np.pi), np.arccosh(5*np.pi/4)] assert_correctness_of_trigonometric_ops(y, expected_output) def check_arctanh(): diff --git a/tests/nightly/test_large_vector.py b/tests/nightly/test_large_vector.py index 23f4b8e4f310..b8edc83220bd 100644 --- a/tests/nightly/test_large_vector.py +++ b/tests/nightly/test_large_vector.py @@ -556,8 +556,8 @@ def test_concat(): a = nd.ones(LARGE_X) b = nd.zeros(LARGE_X) c = nd.concat(a, b, dim=0) - assert c[0][0] == 1 - assert c[-1][-1] == 0 + assert c[0] == 1 + assert c[-1] == 0 assert c.shape[0] == (2 * LARGE_X) @@ -710,6 +710,37 @@ def test_full(): assert a[-1] == 3 +def test_sign(): + a = mx.nd.random.normal(-1, 1, shape=LARGE_X) + mx_res = mx.nd.sign(a) + assert_almost_equal(mx_res[-1].asnumpy(), np.sign(a[-1].asnumpy())) + + +def test_logical(): + def check_logical_and(a, b): + 
mx_res = mx.nd.logical_and(a, b) + assert_almost_equal(mx_res[-1].asnumpy(), np.logical_and(a[-1].asnumpy(), b[-1].asnumpy())) + + def check_logical_or(a, b): + mx_res = mx.nd.logical_or(a, b) + assert_almost_equal(mx_res[-1].asnumpy(), np.logical_or(a[-1].asnumpy(), b[-1].asnumpy())) + + def check_logical_not(a, b): + mx_res = mx.nd.logical_not(a) + assert_almost_equal(mx_res[-1].asnumpy(), np.logical_not(a[-1].asnumpy())) + + def check_logical_xor(a, b): + mx_res = mx.nd.logical_xor(a, b) + assert_almost_equal(mx_res[-1].asnumpy(), np.logical_xor(a[-1].asnumpy(), b[-1].asnumpy())) + + a = mx.nd.ones(LARGE_X) + b = mx.nd.zeros(LARGE_X) + check_logical_and(a, b) + check_logical_or(a, b) + check_logical_not(a, b) + check_logical_xor(a, b) + + def test_astype(): x = create_vector(size=LARGE_X//4) x = nd.tile(x, 4) @@ -752,7 +783,7 @@ def assert_correctness_of_rounding_ops(output, mid, expected_vals): def test_rounding_ops(): x = create_input_for_rounding_ops() - + def check_ceil(): y = nd.ceil(x) # expected ouput for middle 5 values after applying ceil() @@ -854,6 +885,48 @@ def check_tan(): expected_output = [-.577, -1, 0, 1, .577] assert_correctness_of_trigonometric_ops(y, expected_output) + def check_arcsinh(): + x = create_input_for_trigonometric_ops([-np.pi/2, -np.pi/4, 0, np.pi/4, np.pi/2]) + y = nd.arcsinh(x) + # expected output for indices=(0, 1, -3, -2, -1) after applying arcsinh() + expected_output = [np.arcsinh(-np.pi/2), np.arcsinh(-np.pi/4), 0, np.arcsinh(np.pi/4), np.arcsinh(np.pi/2)] + assert_correctness_of_trigonometric_ops(y, expected_output) + + def check_arccosh(): + x = create_input_for_trigonometric_ops([1, np.pi/2, 3*np.pi/4, np.pi, 5*np.pi/4]) + y = nd.arccosh(x) + # expected output for indices=(0, 1, -3, -2, -1) after applying arccosh() + expected_output = [0, np.arccosh(np.pi/2), np.arccosh(3*np.pi/4), np.arccosh(np.pi), np.arccosh(5*np.pi/4)] + assert_correctness_of_trigonometric_ops(y, expected_output) + + def check_arctanh(): + x = create_input_for_trigonometric_ops([-1/4, -1/2, 0, 1/4, 1/2]) + y = nd.arctanh(x) + # expected output for indices=(0, 1, -3, -2, -1) after applying arctanh() + expected_output = [np.arctanh(-1/4), np.arctanh(-1/2), 0, np.arctanh(1/4), np.arctanh(1/2)] + assert_correctness_of_trigonometric_ops(y, expected_output) + + def check_sinh(): + x = create_input_for_trigonometric_ops([-np.pi/2, -np.pi/4, 0, np.pi/4, np.pi/2]) + y = nd.sinh(x) + # expected output for indices=(0, 1, -3, -2, -1) after applying sinh() + expected_output = [np.sinh(-np.pi/2), np.sinh(-np.pi/4), 0, np.sinh(np.pi/4), np.sinh(np.pi/2)] + assert_correctness_of_trigonometric_ops(y, expected_output) + + def check_cosh(): + x = create_input_for_trigonometric_ops([0, 1, np.pi/2, 3*np.pi/4, np.pi]) + y = nd.cosh(x) + # expected output for indices=(0, 1, -3, -2, -1) after applying cosh() + expected_output = [1, np.cosh(1), np.cosh(np.pi/2), np.cosh(3*np.pi/4), np.cosh(np.pi)] + assert_correctness_of_trigonometric_ops(y, expected_output) + + def check_tanh(): + x = create_input_for_trigonometric_ops([-1/4, -1/2, 0, 1/4, 1/2]) + y = nd.tanh(x) + # expected output for indices=(0, 1, -3, -2, -1) after applying tanh() + expected_output = [np.tanh(-1/4), np.tanh(-1/2), 0, np.tanh(1/4), np.tanh(1/2)] + assert_correctness_of_trigonometric_ops(y, expected_output) + def check_radians(): x = create_input_for_trigonometric_ops([0, 90, 180, 270, 360]) y = nd.radians(x) @@ -874,6 +947,12 @@ def check_degrees(): check_sin() check_cos() check_tan() + check_arcsinh() + 
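The new inverse hyperbolic checks pick their inputs to stay inside each function's domain: `arccosh` requires `x >= 1` and `arctanh` requires `|x| < 1`, while `arcsinh` accepts any real input. A quick NumPy illustration of why those particular input lists were chosen:

```python
import numpy as np

print(np.arccosh(np.array([1.0, np.pi / 2])))  # fine: inputs >= 1
print(np.arctanh(np.array([-0.5, 0.25])))      # fine: |x| < 1
with np.errstate(invalid='ignore'):
    print(np.arccosh(np.array([0.5])))         # nan: outside the domain
```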
check_arccosh() + check_arctanh() + check_sinh() + check_cosh() + check_tanh() check_radians() check_degrees() diff --git a/tests/python/unittest/test_numpy_op.py b/tests/python/unittest/test_numpy_op.py index b764ac73d30c..ae8ad621df75 100644 --- a/tests/python/unittest/test_numpy_op.py +++ b/tests/python/unittest/test_numpy_op.py @@ -24,6 +24,7 @@ import platform import mxnet as mx import scipy.stats as ss +from nose.tools import assert_raises from mxnet import np, npx from mxnet.gluon import HybridBlock from mxnet.base import MXNetError @@ -901,6 +902,124 @@ def hybrid_forward(self, F, a): expected_grad[basic_index] = 1 assert same(a.grad.asnumpy(), expected_grad) +@with_seed() +@use_np +def test_npx_batch_dot(): + ctx = mx.context.current_context() + dtypes = ['float32', 'float64'] + if ctx.device_type == 'gpu': + dtypes += ['float16'] + eps_dict = {'float32': 1E-4, 'float64': 1E-4, 'float16': 1E-3} + class TestBatchDot(HybridBlock): + def __init__(self, transpose_a, transpose_b): + super(TestBatchDot, self).__init__() + self._transpose_a = transpose_a + self._transpose_b = transpose_b + + def hybrid_forward(self, F, lhs, rhs): + return F.npx.batch_dot(lhs, rhs, + transpose_a=self._transpose_a, + transpose_b=self._transpose_b) + + def batch_dot_numpy(lhs, rhs, transpose_a, transpose_b): + assert lhs.ndim == rhs.ndim >= 3 + if transpose_a: + lhs = lhs.swapaxes(-1, -2) + if transpose_b: + rhs = rhs.swapaxes(-1, -2) + return _np.matmul(lhs, rhs) + + def gt_grad_batch_dot_numpy(lhs, rhs, ograd, transpose_a, transpose_b, lhs_req, rhs_req, + init_lhs_grad, init_rhs_grad): + + if transpose_a and transpose_b: + # Gradient of z = dot(x.T, y.T) + # dx = dot(dz, y).T = dot(y.T, dz.T) + # dy = dot(x, dz).T = dot(dz.T, x.T) + lhs_grad = batch_dot_numpy(rhs, ograd, transpose_a=True, transpose_b=True) + rhs_grad = batch_dot_numpy(ograd, lhs, transpose_a=True, transpose_b=True) + elif not transpose_a and transpose_b: + # Gradient of z = dot(x, y.T) + # dx = dot(dz, y) + # dy = dot(x.T, dz).T = dot(dz.T, x) + lhs_grad = batch_dot_numpy(ograd, rhs, transpose_a=False, transpose_b=False) + rhs_grad = batch_dot_numpy(ograd, lhs, transpose_a=True, transpose_b=False) + elif transpose_a and not transpose_b: + # Gradient of z = dot(x.T, y) + # dx = dot(dz, y.T).T = dot(y, dz.T) + # dy = dot(x, dz) + lhs_grad = batch_dot_numpy(rhs, ograd, transpose_a=False, transpose_b=True) + rhs_grad = batch_dot_numpy(lhs, ograd, transpose_a=False, transpose_b=False) + else: + # Gradient of z = dot(x, y) + # dx = dot(dz, y.T) + # dy = dot(x.T, dz) + lhs_grad = batch_dot_numpy(ograd, rhs, transpose_a=False, transpose_b=True) + rhs_grad = batch_dot_numpy(lhs, ograd, transpose_a=True, transpose_b=False) + if lhs_req == 'add': + lhs_grad += init_lhs_grad + if rhs_req == 'add': + rhs_grad += init_rhs_grad + return lhs_grad, rhs_grad + + + configs = [ + ((2, 3, 0), (2, 4, 0), False, True), + ((2, 4, 3), (2, 4, 3), True, False), + ((0, 3, 0), (0, 0, 2), False, False), + ((3, 2, 3, 2), (3, 2, 2, 3), True, True), + ((3, 1, 5, 2), (3, 1, 2, 1), False, False) + ] + bad_configs = [ + ((5, 3, 2), (5, 1, 3), False, False), + ((2, 5, 3, 1), (2, 4, 3, 1), True, False) + ] + for hybridize in [True, False]: + for lhs_shape, rhs_shape, transpose_a, transpose_b in configs: + for dtype in dtypes: + eps = eps_dict[dtype] + for lhs_grad_req in ['write', 'add']: + for rhs_grad_req in ['write', 'add']: + f_batch_dot = TestBatchDot(transpose_a=transpose_a, + transpose_b=transpose_b) + if hybridize: + f_batch_dot.hybridize() + lhs_val = 
mx.np.array(_np.random.uniform(-1.0, 1.0, lhs_shape), dtype=dtype) + rhs_val = mx.np.array(_np.random.uniform(-1.0, 1.0, rhs_shape), dtype=dtype) + lhs_val.attach_grad(grad_req=lhs_grad_req) + rhs_val.attach_grad(grad_req=rhs_grad_req) + gt_out = batch_dot_numpy(lhs_val.asnumpy(), rhs_val.asnumpy(), + transpose_a, transpose_b) + init_lhs_grad = mx.np.random.uniform(-1.0, 1.0, lhs_shape, dtype=dtype) + init_rhs_grad = mx.np.random.uniform(-1.0, 1.0, rhs_shape, dtype=dtype) + o_grad = mx.np.random.uniform(-1.0, 1.0, gt_out.shape, dtype=dtype) + if lhs_grad_req == 'add': + lhs_val.grad[:] = init_lhs_grad + if rhs_grad_req == 'add': + rhs_val.grad[:] = init_rhs_grad + with mx.autograd.record(): + out = f_batch_dot(lhs_val, rhs_val) + out.backward(o_grad) + assert_almost_equal(out.asnumpy(), gt_out, rtol=eps, atol=eps) + gt_lhs_grad, gt_rhs_grad = gt_grad_batch_dot_numpy(lhs_val.asnumpy(), + rhs_val.asnumpy(), + o_grad.asnumpy(), + transpose_a=transpose_a, + transpose_b=transpose_b, + lhs_req=lhs_grad_req, + rhs_req=rhs_grad_req, + init_lhs_grad=init_lhs_grad.asnumpy(), + init_rhs_grad=init_rhs_grad.asnumpy()) + assert_almost_equal(lhs_val.grad.asnumpy(), gt_lhs_grad, rtol=eps, atol=eps) + assert_almost_equal(rhs_val.grad.asnumpy(), gt_rhs_grad, rtol=eps, atol=eps) + for lhs_shape, rhs_shape, transpose_a, transpose_b in bad_configs: + for dtype in dtypes: + lhs_val = mx.np.array(_np.random.uniform(-1.0, 1.0, lhs_shape), dtype=dtype) + rhs_val = mx.np.array(_np.random.uniform(-1.0, 1.0, rhs_shape), dtype=dtype) + assert_raises(MXNetError, lambda: mx.npx.batch_dot(lhs_val, rhs_val, + transpose_a=transpose_a, + transpose_b=transpose_b)) + @with_seed() @use_np diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index 7ea106b2620f..dde28fdb766f 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -2964,6 +2964,7 @@ def test_big_transpose(): assert_allclose(x_np, z.asnumpy().astype('uint8')) +@with_seed() def test_larger_transpose(): x = mx.nd.random.normal(shape=(50,51)) y = mx.nd.transpose(x) @@ -3324,9 +3325,9 @@ def test_batch_dot(): agrad_npy = np.empty((batch_size, m, k), dtype=data_type) bgrad_npy = np.empty((batch_size, k, n), dtype=data_type) a_init_grad_npy = np.random.normal(size=(batch_size, m, k)) - a_init_grad_npy = a_npy.astype(data_type) + a_init_grad_npy = a_init_grad_npy.astype(data_type) b_init_grad_npy = np.random.normal(size=(batch_size, k, n)) - b_init_grad_npy = b_npy.astype(data_type) + b_init_grad_npy = b_init_grad_npy.astype(data_type) for i in range(batch_size): c_npy[i, :, :] = np.dot(a_npy[i, :, :], b_npy[i, :, :]) bgrad_npy[i, :, :] = np.dot(a_npy[i, :, :].T, ograd_npy[i, :, :])
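Both the new `test_npx_batch_dot` and the corrected initial-gradient setup in `test_batch_dot` exercise the same `grad_req='add'` pattern: seed the gradient buffer, run backward, and expect the seed plus the computed gradient rather than the computed gradient alone. A condensed sketch of that pattern, with a simple operator standing in for `batch_dot`:

```python
import mxnet as mx

x = mx.nd.ones((2, 3))
x.attach_grad(grad_req='add')  # gradients accumulate instead of overwriting
x.grad[:] = 5.0                # seed, like a_init_grad_npy in the test
with mx.autograd.record():
    y = (x * 2).sum()
y.backward()
print(x.grad)                  # 7 everywhere: seed (5) + d(sum(2x))/dx (2)
```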