From 91ad2664e1c83c80ff318b29fcd32608bee90d04 Mon Sep 17 00:00:00 2001 From: Talia <31782251+TEChopra1000@users.noreply.github.com> Date: Wed, 23 Oct 2019 09:32:18 -0700 Subject: [PATCH 01/32] fixed broken links across multiple files (#16581) --- .../getting-started/crash-course/5-predict.md | 2 +- .../getting-started/crash-course/6-use_gpus.md | 2 +- .../gluon_from_experiment_to_deployment.md | 10 +++++----- .../tutorials/getting-started/to-mxnet/pytorch.md | 12 ++++++------ .../python/tutorials/packages/gluon/image/mnist.md | 2 +- .../src/pages/api/architecture/note_data_loading.md | 2 +- .../docs/tutorials/mxnet_cpp_inference_tutorial.md | 10 +++++----- .../src/pages/api/faq/distributed_training.md | 6 +++--- docs/static_site/src/pages/api/faq/float16.md | 8 ++++---- .../src/pages/api/faq/gradient_compression.md | 2 +- .../src/pages/api/faq/model_parallel_lstm.md | 2 +- docs/static_site/src/pages/api/faq/recordio.md | 1 - .../src/pages/get_started/build_from_source.md | 2 +- julia/docs/src/tutorial/char-lstm.md | 6 +++--- julia/docs/src/tutorial/mnist.md | 2 +- julia/docs/src/user-guide/overview.md | 2 -- julia/examples/char-lstm/README.md | 2 +- 17 files changed, 35 insertions(+), 38 deletions(-) diff --git a/docs/python_docs/python/tutorials/getting-started/crash-course/5-predict.md b/docs/python_docs/python/tutorials/getting-started/crash-course/5-predict.md index 7a7738d8df1b..9afe95b58403 100644 --- a/docs/python_docs/python/tutorials/getting-started/crash-course/5-predict.md +++ b/docs/python_docs/python/tutorials/getting-started/crash-course/5-predict.md @@ -21,7 +21,7 @@ A saved model can be used in multiple places, such as to continue training, to f ## Prerequisites -Please run the [previous tutorial](train.md) to train the network and save its parameters to file. You will need this file to run the following steps. +Please run the [previous tutorial](4-train.html) to train the network and save its parameters to file. You will need this file to run the following steps. ```{.python .input n=1} from mxnet import nd diff --git a/docs/python_docs/python/tutorials/getting-started/crash-course/6-use_gpus.md b/docs/python_docs/python/tutorials/getting-started/crash-course/6-use_gpus.md index b78c38ab7077..a0788ba7df2d 100644 --- a/docs/python_docs/python/tutorials/getting-started/crash-course/6-use_gpus.md +++ b/docs/python_docs/python/tutorials/getting-started/crash-course/6-use_gpus.md @@ -99,7 +99,7 @@ net(x) Finally, we show how to use multiple GPUs to jointly train a neural network through data parallelism. Let's assume there are *n* GPUs. We split each data batch into *n* parts, and then each GPU will run the forward and backward passes using one part of the data. -Let's first copy the data definitions and the transform function from the [previous tutorial](predict.md). +Let's first copy the data definitions and the transform function from the [previous tutorial](5-predict.html). 
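Before the copied definitions below: the per-batch split itself can be done with `gluon.utils.split_and_load`, which slices a batch along its first axis and copies each slice to one device. A minimal sketch, assuming two GPUs are available:

```python
# Minimal sketch of the batch-splitting step, assuming two GPUs.
import mxnet as mx
from mxnet import gluon, nd

data = nd.random.uniform(shape=(256, 1, 28, 28))   # illustrative batch
ctx_list = [mx.gpu(0), mx.gpu(1)]                  # assumed device list
# Slice the batch along axis 0 and copy each part to one device.
parts = gluon.utils.split_and_load(data, ctx_list)
print([p.shape for p in parts])                    # two halves of 128 examples each
```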
```{.python .input} batch_size = 256 diff --git a/docs/python_docs/python/tutorials/getting-started/gluon_from_experiment_to_deployment.md b/docs/python_docs/python/tutorials/getting-started/gluon_from_experiment_to_deployment.md index 8d2c4e100c76..b1f65e682263 100644 --- a/docs/python_docs/python/tutorials/getting-started/gluon_from_experiment_to_deployment.md +++ b/docs/python_docs/python/tutorials/getting-started/gluon_from_experiment_to_deployment.md @@ -20,7 +20,7 @@ ## Overview MXNet Gluon API comes with a lot of great features, and it can provide you everything you need: from experimentation to deploying the model. In this tutorial, we will walk you through a common use case on how to build a model using gluon, train it on your data, and deploy it for inference. -This tutorial covers training and inference in Python, please continue to [C++ inference part](https://mxnet.apache.org/versions/master/tutorials/c++/mxnet_cpp_inference_tutorial.html) after you finish. +This tutorial covers training and inference in Python, please continue to [C++ inference part](/api/cpp/docs/tutorials/cpp_inference) after you finish. Let's say you need to build a service that provides flower species recognition. A common problem is that you don't have enough data to train a good model. In such cases, a technique called Transfer Learning can be used to make a more robust model. In Transfer Learning we make use of a pre-trained model that solves a related task, and was trained on a very large standard dataset, such as ImageNet. ImageNet is from a different domain, but we can utilize the knowledge in this pre-trained model to perform the new task at hand. @@ -77,7 +77,7 @@ from mxnet.gluon.data.vision import transforms from mxnet.gluon.model_zoo.vision import resnet50_v2 ``` -Next, we define the hyper-parameters that we will use for fine-tuning. We will use the [MXNet learning rate scheduler](../packages/gluon/training/learning_rates/learning_rate_schedules.html) to adjust learning rates during training. +Next, we define the hyper-parameters that we will use for fine-tuning. We will use the [MXNet learning rate scheduler](/api/python/docs/tutorials/packages/gluon/training/learning_rates/learning_rate_schedules.html) to adjust learning rates during training. Here we set the `epochs` to 1 for quick demonstration, please change to 40 for actual training. ```python @@ -161,7 +161,7 @@ test_data = gluon.data.DataLoader( We will use pre-trained ResNet50_v2 model which was pre-trained on the [ImageNet Dataset](http://www.image-net.org/) with 1000 classes. To match the classes in the Flower dataset, we must redefine the last softmax (output) layer to be 102, then initialize the parameters. -Before we go to training, one unique Gluon feature you should be aware of is hybridization. It allows you to convert your imperative code to a static symbolic graph, which is much more efficient to execute. There are two main benefits of hybridizing your model: better performance and easier serialization for deployment. The best part is that it's as simple as just calling `net.hybridize()`. To know more about Gluon hybridization, please follow the [hybridization tutorial](https://mxnet.apache.org/tutorials/gluon/hybrid.html). +Before we go to training, one unique Gluon feature you should be aware of is hybridization. It allows you to convert your imperative code to a static symbolic graph, which is much more efficient to execute. 
There are two main benefits of hybridizing your model: better performance and easier serialization for deployment. The best part is that it's as simple as just calling `net.hybridize()`. To know more about Gluon hybridization, please follow the [hybridization tutorial](/api/python/docs/tutorials/packages/gluon/blocks/hybridize.html). @@ -265,7 +265,7 @@ finetune_net.export("flower-recognition", epoch=epochs) ## Load the model and run inference using the MXNet Module API MXNet provides various useful tools and interfaces for deploying your model for inference. For example, you can use [MXNet Model Server](https://github.com/awslabs/mxnet-model-server) to start a service and host your trained model easily. -Besides that, you can also use MXNet's different language APIs to integrate your model with your existing service. We provide [Python](https://mxnet.apache.org/api/python/module/module.html), [Java](https://mxnet.apache.org/api/java/index.html), [Scala](https://mxnet.apache.org/api/scala/index.html), and [C++](https://mxnet.apache.org/api/c++/index.html) APIs. +Besides that, you can also use MXNet's different language APIs to integrate your model with your existing service. We provide [Python](/api/python.html), [Java](/api/java.html), [Scala](/api/scala.html), and [C++](/api/cpp) APIs. Here we will briefly introduce how to run inference using Module API in Python. There is more detailed explanation available in the [Predict Image Tutorial](https://mxnet.apache.org/tutorials/python/predict_image.html). In general, prediction consists of the following steps: @@ -315,7 +315,7 @@ You can continue to the [next tutorial](https://mxnet.apache.org/versions/master You can also find more ways to run inference and deploy your models here: 1. [Java Inference examples](https://github.com/apache/incubator-mxnet/tree/master/scala-package/examples/src/main/java/org/apache/mxnetexamples/javaapi/infer) -2. [Scala Inference examples](https://mxnet.apache.org/tutorials/scala/) +2. [Scala Inference examples](/api/scala/docs/tutorials/infer) 4. [MXNet Model Server Examples](https://github.com/awslabs/mxnet-model-server/tree/master/examples) ## References diff --git a/docs/python_docs/python/tutorials/getting-started/to-mxnet/pytorch.md b/docs/python_docs/python/tutorials/getting-started/to-mxnet/pytorch.md index d7720bac4348..1ab490fbaa42 100644 --- a/docs/python_docs/python/tutorials/getting-started/to-mxnet/pytorch.md +++ b/docs/python_docs/python/tutorials/getting-started/to-mxnet/pytorch.md @@ -164,7 +164,7 @@ mx_trainer = gluon.Trainer(mx_net.collect_params(), 'sgd', {'learning_rate': 0.1}) ``` -The code difference between frameworks is small. The main difference is that in Apache MXNet we use [Trainer](https://mxnet.apache.org/api/python/docs/api/gluon/mxnet.gluon.Trainer.html) class, which accepts optimization algorithm as an argument. We also use [.collect_params()](/api/python/docs/api/gluon/_autogen/mxnet.gluon.nn.Block.collect_params.html) method to get parameters of the network. +The code difference between frameworks is small. The main difference is that in Apache MXNet we use [Trainer](/api/python/docs/api/gluon/trainer.html) class, which accepts optimization algorithm as an argument. We also use [.collect_params()](/api/python/docs/api/gluon/block.html#mxnet.gluon.Block.collect_params) method to get parameters of the network. ### 4. 
Training @@ -212,13 +212,13 @@ Some of the differences in Apache MXNet when compared to PyTorch are as follows: * In Apache MXNet, you don't need to flatten the 4-D input into 2-D when feeding the data into forward pass. -* In Apache MXNet, you need to perform the calculation within the [autograd.record()](/api/python/docs/api/gluon-related/_autogen/mxnet.autograd.record.html) scope so that it can be automatically differentiated in the backward pass. +* In Apache MXNet, you need to perform the calculation within the [autograd.record()](/api/python/docs/api/autograd/index.html?autograd%20record#mxnet.autograd.record) scope so that it can be automatically differentiated in the backward pass. * It is not necessary to clear the gradient every time as with PyTorch's `trainer.zero_grad()` because by default the new gradient is written in, not accumulated. -* You need to specify the update step size (usually batch size) when performing [step()](/api/python/docs/api/gluon/_autogen/mxnet.gluon.Trainer.step.html) on the trainer. +* You need to specify the update step size (usually batch size) when performing [step()](/api/python/docs/api/gluon/trainer.html?#mxnet.gluon.Trainer.step) on the trainer. -* You need to call [.asscalar()](/api/python/docs/api/ndarray/_autogen/mxnet.ndarray.NDArray.asscalar.html) to turn a multidimensional array into a scalar. +* You need to call [.asscalar()](/api/python/docs/api/ndarray/ndarray.html?#mxnet.ndarray.NDArray.asscalar) to turn a multidimensional array into a scalar. * In this sample, Apache MXNet is twice as fast as PyTorch. Though you need to be cautious with such toy comparisons. @@ -230,9 +230,9 @@ As we saw above, Apache MXNet Gluon API and PyTorch have many similarities. The While Apache MXNet Gluon API is very similar to PyTorch, there are some extra functionality that can make your code even faster. -* Check out [Hybridize tutorial](/api/python/docs/guide/packages/gluon/hybridize.html) to learn how to write imperative code which can be converted to symbolic one. +* Check out [Hybridize tutorial](/api/python/docs/tutorials/packages/gluon/blocks/hybridize.html) to learn how to write imperative code which can be converted to symbolic one. -* Also, check out how to extend Apache MXNet with your own [custom layers](/api/python/docs/guide/extend/custom_layer.html). +* Also, check out how to extend Apache MXNet with your own [custom layers](/api/python/docs/tutorials/packages/gluon/blocks/custom-layer.html?custom_layers). ## Appendix diff --git a/docs/python_docs/python/tutorials/packages/gluon/image/mnist.md b/docs/python_docs/python/tutorials/packages/gluon/image/mnist.md index 8a3d8229413b..a6898278edf6 100644 --- a/docs/python_docs/python/tutorials/packages/gluon/image/mnist.md +++ b/docs/python_docs/python/tutorials/packages/gluon/image/mnist.md @@ -112,7 +112,7 @@ to train the MLP network we defined above. For our training, we will make use of the stochastic gradient descent (SGD) optimizer. In particular, we'll be using mini-batch SGD. Standard SGD processes train data one example at a time. In practice, this is very slow and one can speed up the process by processing examples in small batches. In this case, our batch size will be 100, which is a reasonable choice. Another parameter we select here is the learning rate, which controls the step size the optimizer takes in search of a solution. We'll pick a learning rate of 0.02, again a reasonable choice. Settings such as batch size and learning rate are what are usually referred to as hyper-parameters. 
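For concreteness, a hedged sketch of wiring these choices together; `net` and `train_dataset` are assumed from earlier in the tutorial:

```python
# Hedged sketch: mini-batch SGD with the settings discussed above.
# `net` and `train_dataset` are assumed from earlier in the tutorial.
from mxnet import gluon

train_data = gluon.data.DataLoader(train_dataset, batch_size=100, shuffle=True)
trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.02})
```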
What values we give them can have a great impact on training performance. -We will use [Trainer](https://mxnet.io/api/python/docs/api/gluon/mxnet.gluon.Trainer.html) class to apply the +We will use [Trainer](/api/python/docs/api/gluon/trainer.html) class to apply the [SGD optimizer](https://mxnet.io/api/python/docs/api/gluon-related/_autogen/mxnet.optimizer.SGD.html) on the initialized parameters. diff --git a/docs/static_site/src/pages/api/architecture/note_data_loading.md b/docs/static_site/src/pages/api/architecture/note_data_loading.md index 1279d0361e5f..01bf1f23f600 100644 --- a/docs/static_site/src/pages/api/architecture/note_data_loading.md +++ b/docs/static_site/src/pages/api/architecture/note_data_loading.md @@ -125,7 +125,7 @@ then compress into JPEG format. After that, we save a header that indicates the index and label for that image to be used when constructing the *Data* field for that record. We then pack several images together into a file. -You may want to also review the [example using im2rec.py to create a RecordIO dataset](https://mxnet.apache.org/tutorials/basic/data.html#loading-data-using-image-iterators). +You may want to also review the [example using im2rec.py to create a RecordIO dataset](https://mxnet.apache.org/api/faq/recordio). ### Access Arbitrary Parts Of Data diff --git a/docs/static_site/src/pages/api/cpp/docs/tutorials/mxnet_cpp_inference_tutorial.md b/docs/static_site/src/pages/api/cpp/docs/tutorials/mxnet_cpp_inference_tutorial.md index 9392eca2977f..0d96817560d0 100644 --- a/docs/static_site/src/pages/api/cpp/docs/tutorials/mxnet_cpp_inference_tutorial.md +++ b/docs/static_site/src/pages/api/cpp/docs/tutorials/mxnet_cpp_inference_tutorial.md @@ -29,7 +29,7 @@ tag: cpp ## Overview MXNet provides various useful tools and interfaces for deploying your model for inference. For example, you can use [MXNet Model Server](https://github.com/awslabs/mxnet-model-server) to start a service and host your trained model easily. Besides that, you can also use MXNet's different language APIs to integrate your model with your existing service. We provide [Python]({{'/api/python/docs/api/symbol-related/mxnet.module'|relative_url}}), [Java]({{'/api/java/docs/api'|relative_url}}), [Scala]({{'/api/scala/docs/api'|relative_url}}), and [C++]({{'/api/cpp/docs/api'|relative_url}}) APIs. -We will focus on the MXNet C++ API. We have slightly modified the code in [C++ Inference Example](https://github.com/apache/incubator-mxnet/tree/master/example/inference) for our use case. +We will focus on the MXNet C++ API. We have slightly modified the code in [C++ Inference Example](https://github.com/apache/incubator-mxnet/tree/master/cpp-package/example/inference) for our use case. ## Prerequisites @@ -105,7 +105,7 @@ class Predictor { ### Load the model, synset file, and normalization values -In the Predictor constructor, you need to provide paths to saved json and param files. After that, add the following methods `LoadModel` and `LoadParameters` to load the network and its parameters. This part is the same as [the example](https://github.com/apache/incubator-mxnet/blob/master/cpp-package/example/inference/inception_inference.cpp). +In the Predictor constructor, you need to provide paths to saved json and param files. After that, add the following methods `LoadModel` and `LoadParameters` to load the network and its parameters. This part is the same as [the example](https://github.com/apache/incubator-mxnet/blob/master/cpp-package/example/inference/imagenet_inference.cpp). 
Next, we need to load synset file, and normalization values. We have made the following change since our synset file contains flower names and we used both mean and standard deviation for image normalization. @@ -280,12 +280,12 @@ Then it will predict your image: Now you can explore more ways to run inference and deploy your models: 1. [Java Inference examples](https://github.com/apache/incubator-mxnet/tree/master/scala-package/examples/src/main/java/org/apache/mxnetexamples/javaapi/infer) -2. [Scala Inference examples]({{'/api/scala/docs/tutorials'|relative_url}}) -3. [ONNX model inference examples]({{'/api/python/docs/tutorials/deploy/index.html'|relative_url}}) +2. [Scala Inference examples](/api/scala/docs/tutorials) +3. [ONNX model inference examples](/api/python/docs/tutorials/deploy/index.html) 4. [MXNet Model Server Examples](https://github.com/awslabs/mxnet-model-server/tree/master/examples) ## References -1. [Gluon end to end tutorial]({{'/api/python/docs/tutorials/packages/gluon/gluon_from_experiment_to_deployment.html'|relative_url}}) +1. [Gluon end to end tutorial](/api/python/docs/tutorials/getting-started/gluon_from_experiment_to_deployment.html) 2. [Gluon C++ inference example](https://github.com/apache/incubator-mxnet/blob/master/cpp-package/example/inference/) 3. [Gluon C++ package](https://github.com/apache/incubator-mxnet/tree/master/cpp-package) diff --git a/docs/static_site/src/pages/api/faq/distributed_training.md b/docs/static_site/src/pages/api/faq/distributed_training.md index caf0123b7aea..622ace60f780 100644 --- a/docs/static_site/src/pages/api/faq/distributed_training.md +++ b/docs/static_site/src/pages/api/faq/distributed_training.md @@ -91,7 +91,7 @@ In the case of distributed training though, we would need to divide the dataset Typically, this split of data for each worker happens through the data iterator, on passing the number of parts and the index of parts to iterate over. -Some iterators in MXNet that support this feature are [mxnet.io.MNISTIterator]({{'//api/mxnet/io/index.html#mxnet.io.MNISTIter'|relative_url}}) and [mxnet.io.ImageRecordIter]({{'/api/mxnet/io/index.html#mxnet.io.ImageRecordIter'|relative_url}}). +Some iterators in MXNet that support this feature are [mxnet.io.MNISTIterator](/api/python/docs/api/mxnet/io/index.html?MNISTIter#mxnet.io.MNISTIter) and [mxnet.io.ImageRecordIter](api/python/docs/api/mxnet/io/index.html?imagerecorditer#mxnet.io.ImageRecordIter). If you are using a different iterator, you can look at how the above iterators implement this. We can use the kvstore object to get the number of workers (`kv.num_workers`) and rank of the current worker (`kv.rank`). These can be passed as arguments to the iterator. @@ -101,7 +101,7 @@ to see an example usage. ### Updating weights KVStore server supports two modes, one which aggregates the gradients and updates the weights using those gradients, and second where the server only aggregates gradients. In the latter case, when a worker process pulls from kvstore, it gets the aggregated gradients. The worker then uses these gradients and applies the weights locally. 
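To make the data splitting described above concrete, here is a minimal sketch of sharding a RecordIO-backed iterator per worker; the dataset path and shapes are hypothetical:

```python
# Minimal sketch of per-worker data sharding with a distributed kvstore.
import mxnet as mx

kv = mx.kvstore.create('dist_sync')
train_iter = mx.io.ImageRecordIter(
    path_imgrec='train.rec',      # hypothetical dataset path
    data_shape=(3, 224, 224),
    batch_size=128,
    num_parts=kv.num_workers,     # total number of shards
    part_index=kv.rank)           # this worker's shard
```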
-When using Gluon there is an option to choose between these modes by passing `update_on_kvstore` variable when you create the [Trainer]({{'/api/python/docs/api/gluon/mxnet.gluon.Trainer.html'|relative_url}}) object like this: +When using Gluon there is an option to choose between these modes by passing `update_on_kvstore` variable when you create the [Trainer](/api/python/docs/api/gluon/trainer.html) object like this: ``` trainer = gluon.Trainer(net.collect_params(), optimizer='sgd', @@ -190,7 +190,7 @@ git clone --recursive https://github.com/apache/incubator-mxnet ``` #### Example -Let us consider training a VGG11 model on the CIFAR10 dataset using [example/gluon/image_classification.py](https://github.com/apache/incubator-mxnet/blob/master/example/gluon/image_classification.py). +Let us consider training a VGG11 model on the CIFAR10 dataset using [example/gluon/image_classification.py](https://github.com/apache/incubator-mxnet/blob/master/tools/launch.py). ``` cd example/gluon/ ``` diff --git a/docs/static_site/src/pages/api/faq/float16.md b/docs/static_site/src/pages/api/faq/float16.md index d824acb3ce6d..e63bf87ac68f 100644 --- a/docs/static_site/src/pages/api/faq/float16.md +++ b/docs/static_site/src/pages/api/faq/float16.md @@ -39,7 +39,7 @@ The float16 data type is a 16 bit floating point representation according to the - CUDA 9 or higher - cuDNN v7 or higher -This tutorial also assumes understanding of how to train a network with float32 (the default). Please refer to [logistic regression tutorial](https://mxnet.apache.org/versions/master/tutorials/gluon/logistic_regression_explained.html) to get started with Apache MXNet and Gluon API. This tutorial focuses on the changes needed to switch from float32 to mixed precision and tips on achieving the best performance with mixed precision. +This tutorial also assumes understanding of how to train a network with float32 (the default). Please refer to [logistic regression tutorial](/api/python/docs/tutorials/getting-started/logistic_regression_explained.html) to get started with Apache MXNet and Gluon API. This tutorial focuses on the changes needed to switch from float32 to mixed precision and tips on achieving the best performance with mixed precision. ## Using the Gluon API @@ -47,13 +47,13 @@ This tutorial also assumes understanding of how to train a network with float32 With Gluon API, you need to take care of three things to convert a model to support computation with float16. -1. Cast Gluon `Block`'s parameters and expected input type to float16 by calling the [cast]({{'/api/python/docs/api/gluon/mxnet.gluon.nn.Block.html#mxnet.gluon.nn.Block.cast'|relative_url}}) method of the `Block` representing the network. +1. Cast Gluon `Block`'s parameters and expected input type to float16 by calling the [cast](/api/python/docs/api/gluon/block.html?cast#mxnet.gluon.Block.cast) method of the `Block` representing the network. ```python net.cast('float16') ``` -2. Ensure the data input to the network is of float16 type. If your `DataLoader` or `Iterator` produces output in another datatype, then you would have to cast your data. There are different ways you can do this. The easiest would be to use the [astype]({{'/api/python/docs/api/ndarray/_autogen/mxnet.ndarray.NDArray.astype.html#mxnet.ndarray.NDArray.astype'|relative_url}}) method of NDArrays. +2. Ensure the data input to the network is of float16 type. If your `DataLoader` or `Iterator` produces output in another datatype, then you would have to cast your data. 
There are different ways you can do this. The easiest would be to use the [astype](/api/python/docs/api/ndarray/ndarray.html?astype#mxnet.ndarray.NDArray.astype) method of NDArrays. ```python data = data.astype('float16', copy=False) @@ -98,7 +98,7 @@ net.features = pretrained_net.features net.cast('float16') ``` -You can check the parameters of the model by calling [summary]({{'/api/python/docs/api/gluon/mxnet.gluon.nn.Block.html#mxnet.gluon.nn.Block.summary'|relative_url}}) with some fake data. Notice the provided `dtype=np.float16` in the line below. As it was mentioned earlier, we have to provide data as float16 as well. +You can check the parameters of the model by calling [summary](/api/python/docs/api/gluon/block.html?block%20summary#mxnet.gluon.Block.summary) with some fake data. Notice the provided `dtype=np.float16` in the line below. As it was mentioned earlier, we have to provide data as float16 as well. ```python net.summary(mx.nd.uniform(shape=(1, 3, 224, 224), dtype=np.float16)) diff --git a/docs/static_site/src/pages/api/faq/gradient_compression.md b/docs/static_site/src/pages/api/faq/gradient_compression.md index 1f4c5fb21903..e2b47c646ada 100644 --- a/docs/static_site/src/pages/api/faq/gradient_compression.md +++ b/docs/static_site/src/pages/api/faq/gradient_compression.md @@ -110,7 +110,7 @@ A reference `gluon` implementation with a gradient compression option can be fou mod = mx.mod.Module(..., compression_params={'type’:'2bit', 'threshold':0.5}) ``` -A `module` example is provided with [this guide for setting up MXNet with distributed training](https://mxnet.apache.org/versions/master/faq/multi_devices.html#distributed-training-with-multiple-machines). It comes with the option of turning on gradient compression as an argument to the [train_mnist.py script](https://github.com/apache/incubator-mxnet/blob/master/example/image-classification/train_mnist.py). +A `module` example is provided with [this guide for setting up MXNet with distributed training](/api/faq/distributed_training). It comes with the option of turning on gradient compression as an argument to the [train_mnist.py script](https://github.com/apache/incubator-mxnet/blob/master/example/image-classification/train_mnist.py). ### Configuration Details diff --git a/docs/static_site/src/pages/api/faq/model_parallel_lstm.md b/docs/static_site/src/pages/api/faq/model_parallel_lstm.md index 60df280b38fe..08cf6be76a90 100644 --- a/docs/static_site/src/pages/api/faq/model_parallel_lstm.md +++ b/docs/static_site/src/pages/api/faq/model_parallel_lstm.md @@ -37,7 +37,7 @@ One key strength of _MXNet_ is its ability to leverage powerful heterogeneous hardware environments to achieve significant speedups. There are two primary ways that we can spread a workload across multiple devices. -In a previous document, [we addressed data parallelism](multi_devices), +In a previous document, [we addressed data parallelism](/api/faq/distributed_training), an approach in which samples within a batch are divided among the available devices. With data parallelism, each device stores a complete copy of the model. Here, we explore _model parallelism_, a different approach. diff --git a/docs/static_site/src/pages/api/faq/recordio.md b/docs/static_site/src/pages/api/faq/recordio.md index 75407cb3da5f..2e8fcdd647f3 100644 --- a/docs/static_site/src/pages/api/faq/recordio.md +++ b/docs/static_site/src/pages/api/faq/recordio.md @@ -38,7 +38,6 @@ We provide two tools for creating a RecordIO dataset. 
* [im2rec.py](https://github.com/apache/incubator-mxnet/blob/master/tools/im2rec.py) - implements the tool using the Python API. Both provide the same output: a RecordIO dataset. -You may want to also review the [example using real-world data with im2rec.py.](https://mxnet.apache.org/tutorials/basic/data.html#loading-data-using-image-iterators) ### Prerequisites diff --git a/docs/static_site/src/pages/get_started/build_from_source.md b/docs/static_site/src/pages/get_started/build_from_source.md index e8f7d468b399..20a4542461c4 100644 --- a/docs/static_site/src/pages/get_started/build_from_source.md +++ b/docs/static_site/src/pages/get_started/build_from_source.md @@ -98,7 +98,7 @@ Those can be extended with [LAPACK (Linear Algebra Package)](https://github.com/ MXNet supports multiple mathematical backends for computations on the CPU: * [Apple Accelerate](https://developer.apple.com/documentation/accelerate) -* [ATLAS](https://math-atlas.sourceforge.net/) +* [ATLAS](http://math-atlas.sourceforge.net/) * [MKL](https://software.intel.com/en-us/intel-mkl) (MKL, MKLML) * [MKL-DNN](https://github.com/intel/mkl-dnn) * [OpenBLAS](https://www.openblas.net/) diff --git a/julia/docs/src/tutorial/char-lstm.md b/julia/docs/src/tutorial/char-lstm.md index bc7f7b471d94..ab7e9352b5ab 100644 --- a/julia/docs/src/tutorial/char-lstm.md +++ b/julia/docs/src/tutorial/char-lstm.md @@ -31,7 +31,7 @@ networks yet, the example shown here is an implementation of LSTM by using the default FeedForward model via explicitly unfolding over time. We will be using fixed-length input sequence for training. The code is adapted from the [char-rnn example for MXNet's Python -binding](https://github.com/dmlc/mxnet/blob/master/example/rnn/char_lstm.ipynb), +binding](https://github.com/dmlc/mxnet-notebooks/blob/master/python/tutorials/char_lstm.ipynb), which demonstrates how to use low-level [Symbolic API](@ref) to build customized neural network models directly. @@ -165,7 +165,7 @@ char-lstm. To train the model, we just follow the standard high-level API. Firstly, we construct a LSTM symbolic architecture: Note all the parameters are defined in -[examples/char-lstm/config.jl](https://github.com/dmlc/MXNet.jl/blob/master/examples/char-lstm/config.jl). +[examples/char-lstm/config.jl](https://github.com/apache/incubator-mxnet/blob/master/julia/examples/char-lstm/config.jl). Now we load the text file and define the data provider. The data `input.txt` we used in this example is [a tiny Shakespeare dataset](https://github.com/dmlc/web-data/tree/master/mxnet/tinyshakespeare). @@ -318,6 +318,6 @@ illustrations](http://colah.github.io/posts/2015-08-Understanding-LSTMs/), but could otherwise be very useful for debugging. As we can see, the LSTM unfolded over time is just a (very) deep neural network. The complete code for producing this visualization can be found in -[examples/char-lstm/visualize.jl](https://github.com/apache/incubator-mxnet/tree/master/julia/examples/char-lstmvisualize.jl). +[examples/char-lstm/visualize.jl](https://github.com/apache/incubator-mxnet/blob/master/julia/examples/char-lstm/visualize.jl). ![image](images/char-lstm-vis.svg) diff --git a/julia/docs/src/tutorial/mnist.md b/julia/docs/src/tutorial/mnist.md index cc5267071f11..edc1a67d2485 100644 --- a/julia/docs/src/tutorial/mnist.md +++ b/julia/docs/src/tutorial/mnist.md @@ -23,7 +23,7 @@ multi-layer perceptron and then a convolutional neural network (the LeNet architecture) on the [MNIST handwritten digit dataset](http://yann.lecun.com/exdb/mnist/). 
The code for this tutorial could be found in -[examples/mnist](https://github.com/dmlc/MXNet.jl/tree/master/examples/mnist). There are also two Jupyter notebooks that expand a little more on the [MLP](https://github.com/ultradian/julia_notebooks/blob/master/mxnet/mnistMLP.ipynb) and the [LeNet](https://github.com/ultradian/julia_notebooks/blob/master/mxnet/mnistLenet.ipynb), using the more general `ArrayDataProvider`. +[examples/mnist](https://github.com/apache/incubator-mxnet/blob/master/julia/docs/src/tutorial/mnist.md). There are also two Jupyter notebooks that expand a little more on the [MLP](https://github.com/ultradian/julia_notebooks/blob/master/mxnet/mnistMLP.ipynb) and the [LeNet](https://github.com/ultradian/julia_notebooks/blob/master/mxnet/mnistLenet.ipynb), using the more general `ArrayDataProvider`. Simple 3-layer MLP ------------------ diff --git a/julia/docs/src/user-guide/overview.md b/julia/docs/src/user-guide/overview.md index 974cc7dee974..342448a15bed 100644 --- a/julia/docs/src/user-guide/overview.md +++ b/julia/docs/src/user-guide/overview.md @@ -269,8 +269,6 @@ symbolic composition system. It is like [Theano](http://deeplearning.net/software/theano/), except that we avoided long expression compilation time by providing *larger* neural network related building blocks to guarantee computation performance. -See also [this note](https://mxnet.readthedocs.org/en/latest/program_model.html) -for the design and trade-off of the MXNet symbolic composition system. The basic type is `mx.SymbolicNode`. The following is a trivial example of composing two symbols with the `+` operation. diff --git a/julia/examples/char-lstm/README.md b/julia/examples/char-lstm/README.md index ac745dd4cc41..155f29603623 100644 --- a/julia/examples/char-lstm/README.md +++ b/julia/examples/char-lstm/README.md @@ -29,7 +29,7 @@ and `StatsBase.jl`. ## Training This example is adapted from the -[example in Python binding](https://github.com/dmlc/mxnet/blob/master/example/rnn/char_lstm.ipynb) of +[example in Python binding](https://github.com/dmlc/mxnet-notebooks/blob/master/python/tutorials/char_lstm.ipynb) of MXNet. The data `input.txt` can be downloaded [here](https://github.com/dmlc/web-data/tree/master/mxnet/tinyshakespeare). Modify parameters in [config.jl](config.jl) and then run [train.jl](train.jl). 
An example output From e22e93f50f9e09d758a283733c901f4917b58321 Mon Sep 17 00:00:00 2001 From: Soji Adeshina Date: Wed, 23 Oct 2019 14:44:57 -0700 Subject: [PATCH 02/32] fix missing docs due to git add issues (#16496) --- .../python/api/gluon/data/index.rst | 63 +++++++++++++++++++ .../api/gluon/data/vision/datasets/index.rst | 26 ++++++++ .../python/api/gluon/data/vision/index.rst | 53 ++++++++++++++++ .../gluon/data/vision/transforms/index.rst | 48 ++++++++++++++ .../python/api/mxnet/log/index.rst | 23 +++++++ .../python/api/mxnet/model/index.rst | 23 +++++++ 6 files changed, 236 insertions(+) create mode 100644 docs/python_docs/python/api/gluon/data/index.rst create mode 100644 docs/python_docs/python/api/gluon/data/vision/datasets/index.rst create mode 100644 docs/python_docs/python/api/gluon/data/vision/index.rst create mode 100644 docs/python_docs/python/api/gluon/data/vision/transforms/index.rst create mode 100644 docs/python_docs/python/api/mxnet/log/index.rst create mode 100644 docs/python_docs/python/api/mxnet/model/index.rst diff --git a/docs/python_docs/python/api/gluon/data/index.rst b/docs/python_docs/python/api/gluon/data/index.rst new file mode 100644 index 000000000000..f9e8a21e69d2 --- /dev/null +++ b/docs/python_docs/python/api/gluon/data/index.rst @@ -0,0 +1,63 @@ +.. Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. + +gluon.data +========== + +.. automodule:: mxnet.gluon.data + +Datasets +-------- + +.. autosummary:: + + Dataset + ArrayDataset + RecordFileDataset + SimpleDataset + +Sampling +-------- + +.. autosummary:: + + Sampler + SequentialSampler + RandomSampler + BatchSampler + +DataLoader +---------- + +.. autosummary:: + + DataLoader + + +API Reference +------------- +.. automodule:: mxnet.gluon.data + :members: + :imported-members: + :autosummary: + +.. toctree:: + :hidden: + :maxdepth: 2 + :glob: + + */index \ No newline at end of file diff --git a/docs/python_docs/python/api/gluon/data/vision/datasets/index.rst b/docs/python_docs/python/api/gluon/data/vision/datasets/index.rst new file mode 100644 index 000000000000..6b007526607a --- /dev/null +++ b/docs/python_docs/python/api/gluon/data/vision/datasets/index.rst @@ -0,0 +1,26 @@ +.. Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. 
You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. + +vision.datasets +=============== + +Gluon provides pre-defined vision datasets functions in the :py:mod:`mxnet.gluon.data.vision.datasets` +module. + +.. automodule:: mxnet.gluon.data.vision.datasets + :members: + :autosummary: diff --git a/docs/python_docs/python/api/gluon/data/vision/index.rst b/docs/python_docs/python/api/gluon/data/vision/index.rst new file mode 100644 index 000000000000..2731b5f4245a --- /dev/null +++ b/docs/python_docs/python/api/gluon/data/vision/index.rst @@ -0,0 +1,53 @@ +.. Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. + +data.vision +============ + +.. automodule:: mxnet.gluon.data.vision + +Datasets +^^^^^^^^ + +.. autosummary:: + :nosignatures: + + mxnet.gluon.data.vision.datasets + + +Data transformations +^^^^^^^^^^^^^^^^^^^^ + + +.. autosummary:: + :nosignatures: + + mxnet.gluon.data.vision.transforms + + +API Reference +------------- +.. automodule:: mxnet.gluon.data.vision + :members: + :autosummary: + +.. toctree:: + :hidden: + :maxdepth: 2 + :glob: + + */index \ No newline at end of file diff --git a/docs/python_docs/python/api/gluon/data/vision/transforms/index.rst b/docs/python_docs/python/api/gluon/data/vision/transforms/index.rst new file mode 100644 index 000000000000..60d975d87aff --- /dev/null +++ b/docs/python_docs/python/api/gluon/data/vision/transforms/index.rst @@ -0,0 +1,48 @@ +.. Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. + +vision.transforms +================= + +Gluon provides pre-defined vision transformation and data augmentation functions in the :py:mod:`mxnet.gluon.data.vision.transforms` +module. + +.. currentmodule:: mxnet.gluon.data.vision + +.. 
autosummary:: + :nosignatures: + + transforms.Compose + transforms.Cast + transforms.ToTensor + transforms.Normalize + transforms.RandomResizedCrop + transforms.CenterCrop + transforms.Resize + transforms.RandomFlipLeftRight + transforms.RandomFlipTopBottom + transforms.RandomBrightness + transforms.RandomContrast + transforms.RandomSaturation + transforms.RandomHue + transforms.RandomColorJitter + transforms.RandomLighting + +API Reference +------------- +.. automodule:: mxnet.gluon.data.vision.transforms + :members: diff --git a/docs/python_docs/python/api/mxnet/log/index.rst b/docs/python_docs/python/api/mxnet/log/index.rst new file mode 100644 index 000000000000..fd4d8788c28a --- /dev/null +++ b/docs/python_docs/python/api/mxnet/log/index.rst @@ -0,0 +1,23 @@ +.. Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. + +mxnet.log +========= + +.. automodule:: mxnet.log + :members: + :autosummary: \ No newline at end of file diff --git a/docs/python_docs/python/api/mxnet/model/index.rst b/docs/python_docs/python/api/mxnet/model/index.rst new file mode 100644 index 000000000000..69bcddce6bc1 --- /dev/null +++ b/docs/python_docs/python/api/mxnet/model/index.rst @@ -0,0 +1,23 @@ +.. Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. + +mxnet.model +=========== + +.. automodule:: mxnet.model + :members: + :autosummary: From 05a4c4f88b27d031f997ff76bdf8a7f7f3b47768 Mon Sep 17 00:00:00 2001 From: Marco de Abreu Date: Thu, 24 Oct 2019 01:20:45 +0200 Subject: [PATCH 03/32] Create SECURITY.md (#16573) * Create SECURITY.md * Update SECURITY.md --- SECURITY.md | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 SECURITY.md diff --git a/SECURITY.md b/SECURITY.md new file mode 100644 index 000000000000..bbb4505499c1 --- /dev/null +++ b/SECURITY.md @@ -0,0 +1,25 @@ + + + + + + + + + + + + + + + + + +# Security Policy + +## Reporting a Vulnerability +The Apache Software Foundation takes a very active stance in eliminating security problems and denial of service attacks against its products. 
+ +We strongly encourage folks to report such problems to our private security mailing list first, before disclosing them in a public forum. + +For instructions how to report a security vulnerability, please consult our [security guide](https://mxnet.apache.org/api/faq/security). From c3395ca60b20f4388dd76746696497148b82fc80 Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Wed, 23 Oct 2019 18:17:51 -0700 Subject: [PATCH 04/32] [Numpy] Support N_D(N>=3) batch_dot (#16586) * Support N_D(N>=3) batch_dot * use 1E-4 * fix lint * remove unnecessary comment * Update test_numpy_op.py --- src/operator/tensor/dot-inl.h | 177 +++++++++---------------- src/operator/tensor/dot.cc | 84 +++++++++--- src/operator/tensor/dot.cu | 3 - tests/python/unittest/test_numpy_op.py | 119 +++++++++++++++++ tests/python/unittest/test_operator.py | 4 +- 5 files changed, 249 insertions(+), 138 deletions(-) diff --git a/src/operator/tensor/dot-inl.h b/src/operator/tensor/dot-inl.h index 96c869f40d40..8405404dc627 100644 --- a/src/operator/tensor/dot-inl.h +++ b/src/operator/tensor/dot-inl.h @@ -30,6 +30,7 @@ #include #include #include + #include "./util/tensor_util-inl.h" #include "../mshadow_op.h" #include "../elemwise_op_common.h" @@ -1353,6 +1354,7 @@ void BatchDotForward_(const nnvm::NodeAttrs& attrs, using namespace mshadow; using namespace mshadow::expr; mshadow::Stream *s = ctx.get_stream(); + if (req[0] == kNullOp) return; const DotParam& param = nnvm::get(attrs.parsed); CHECK_EQ(outputs[0].type_flag_, inputs[0].type_flag_) << "Binary function only support input/output with the same type"; @@ -1362,115 +1364,46 @@ void BatchDotForward_(const nnvm::NodeAttrs& attrs, (outputs[0].type_flag_ == kFloat16 && ctx.run_ctx.ctx.dev_mask() == mshadow::gpu::kDevMask)) << "dot only supports float32/float64 for CPU, and float16/float32/float64 for GPU"; MSHADOW_REAL_TYPE_SWITCH(outputs[0].type_flag_, DType, { - mshadow::Tensor out = outputs[0].get(s); - mshadow::Tensor mlhs = inputs[0].get(s); - mshadow::Tensor mrhs = inputs[1].get(s); - mshadow::Tensor workspace = - ctx.requested[0].get_space_typed(mshadow::Shape1(3 * out.size(0)), s); - if (kNullOp != req[0]) { - if (param.transpose_a && param.transpose_b) { - mshadow::BatchGEMM(out, mlhs, mrhs, (DType)1.0f, - (kAddTo == req[0]) ? (DType)1.0f : (DType)0.0f, - workspace); - } else if (!param.transpose_a && param.transpose_b) { - mshadow::BatchGEMM(out, mlhs, mrhs, (DType)1.0f, - (kAddTo == req[0]) ? (DType)1.0f : (DType)0.0f, - workspace); - } else if (param.transpose_a && !param.transpose_b) { - mshadow::BatchGEMM(out, mlhs, mrhs, (DType)1.0f, - (kAddTo == req[0]) ? (DType)1.0f : (DType)0.0f, - workspace); - } else { - mshadow::BatchGEMM(out, mlhs, mrhs, (DType)1.0f, - (kAddTo == req[0]) ? 
(DType)1.0f : (DType)0.0f, - workspace); + int ndim = outputs[0].ndim(); + if (outputs[0].shape_.Size() == 0 || inputs[0].shape_.Size() == 0 + || inputs[1].shape_.Size() == 0) { + if (outputs[0].shape_.Size() != 0 && req[0] != kAddTo) { + mxnet_op::Kernel::Launch(s, outputs[0].shape_.Size(), + outputs[0].dptr()); } + return; } - }); -} - -template -void BatchDotBackward_(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - using namespace mshadow; - using namespace mshadow::expr; - mshadow::Stream *s = ctx.get_stream(); - const DotParam& param = nnvm::get(attrs.parsed); - CHECK_NE(req[1], kWriteInplace); - CHECK_NE(req[0], kWriteInplace); - CHECK(outputs[0].type_flag_ == kFloat32 || outputs[0].type_flag_ == kFloat64 || - (outputs[0].type_flag_ == kFloat16 && ctx.run_ctx.ctx.dev_mask() == mshadow::gpu::kDevMask)) - << "dot only supports float32/float64 for CPU, and float16/float32/float64 for GPU"; - MSHADOW_REAL_TYPE_SWITCH(outputs[0].type_flag_, DType, { - mshadow::Tensor mout_grad = inputs[0].get(s); - mshadow::Tensor mlhs_data = inputs[1].get(s); - mshadow::Tensor mrhs_data = inputs[2].get(s); - mshadow::Tensor mlhs_grad = outputs[0].get(s); - mshadow::Tensor mrhs_grad = outputs[1].get(s); - mshadow::Tensor workspace = - ctx.requested[0].get_space_typed( - mshadow::Shape2(2, 3 * mout_grad.size(0)), s); - mshadow::Tensor rhs_workspace = workspace[0]; - mshadow::Tensor lhs_workspace = workspace[1]; + size_t batch_size = outputs[0].shape_.ProdShape(0, ndim - 2); + mshadow::Tensor out = + outputs[0].get_with_shape(Shape3(batch_size, + outputs[0].shape_[ndim - 2], + outputs[0].shape_[ndim - 1]), s); + mshadow::Tensor mlhs = + inputs[0].get_with_shape(Shape3(batch_size, + inputs[0].shape_[ndim - 2], + inputs[0].shape_[ndim - 1]), s); + mshadow::Tensor mrhs = + inputs[1].get_with_shape(Shape3(batch_size, + inputs[1].shape_[ndim - 2], + inputs[1].shape_[ndim - 1]), s); + mshadow::Tensor workspace = + ctx.requested[0].get_space_typed(mshadow::Shape1(3 * out.size(0)), s); if (param.transpose_a && param.transpose_b) { - // Gradient of z = dot(x.T, y.T) - // dy = dot(x, dz).T = dot(dz.T, x.T) - // dx = dot(dz, y).T = dot(y.T, dz.T) - if (kNullOp != req[1]) { - mshadow::BatchGEMM(mrhs_grad, mout_grad, mlhs_data, (DType)1.0f, - (kAddTo == req[1]) ? (DType)1.0f : (DType)0.0f, - rhs_workspace); - } - if (kNullOp != req[0]) { - mshadow::BatchGEMM(mlhs_grad, mrhs_data, mout_grad, (DType)1.0f, - (kAddTo == req[0]) ? (DType)1.0f : (DType)0.0f, - lhs_workspace); - } + mshadow::BatchGEMM(out, mlhs, mrhs, (DType)1.0f, + (kAddTo == req[0]) ? (DType)1.0f : (DType)0.0f, + workspace); } else if (!param.transpose_a && param.transpose_b) { - // Gradient of z = dot(x, y.T) - // dy = dot(x.T, dz).T = dot(dz.T, x) - // dx = dot(dz, y) - if (kNullOp != req[1]) { - mshadow::BatchGEMM(mrhs_grad, mout_grad, mlhs_data, (DType)1.0f, - (kAddTo == req[1]) ? (DType)1.0f : (DType)0.0f, - rhs_workspace); - } - if (kNullOp != req[0]) { - mshadow::BatchGEMM(mlhs_grad, mout_grad, mrhs_data, (DType)1.0f, - (kAddTo == req[0]) ? (DType)1.0f : (DType)0.0f, - lhs_workspace); - } + mshadow::BatchGEMM(out, mlhs, mrhs, (DType)1.0f, + (kAddTo == req[0]) ? 
(DType)1.0f : (DType)0.0f, + workspace); } else if (param.transpose_a && !param.transpose_b) { - // Gradient of z = dot(x.T, y) - // dy = dot(x, dz) - // dx = dot(dz, y.T).T = dot(y, dz.T) - if (kNullOp != req[1]) { - mshadow::BatchGEMM(mrhs_grad, mlhs_data, mout_grad, (DType)1.0f, - (kAddTo == req[1]) ? (DType)1.0f : (DType)0.0f, - rhs_workspace); - } - if (kNullOp != req[0]) { - mshadow::BatchGEMM(mlhs_grad, mrhs_data, mout_grad, (DType)1.0f, - (kAddTo == req[0]) ? (DType)1.0f : (DType)0.0f, - lhs_workspace); - } + mshadow::BatchGEMM(out, mlhs, mrhs, (DType)1.0f, + (kAddTo == req[0]) ? (DType)1.0f : (DType)0.0f, + workspace); } else { - // Gradient of z = dot(x, y) - // dy = dot(x.T, dz) - // dx = dot(dz, y.T) - if (kNullOp != req[1]) { - mshadow::BatchGEMM(mrhs_grad, mlhs_data, mout_grad, (DType)1.0f, - (kAddTo == req[1]) ? (DType)1.0f : (DType)0.0f, - rhs_workspace); - } - if (kNullOp != req[0]) { - mshadow::BatchGEMM(mlhs_grad, mout_grad, mrhs_data, (DType)1.0f, - (kAddTo == req[0]) ? (DType)1.0f : (DType)0.0f, - lhs_workspace); - } + mshadow::BatchGEMM(out, mlhs, mrhs, (DType)1.0f, + (kAddTo == req[0]) ? (DType)1.0f : (DType)0.0f, + workspace); } }); } @@ -1485,24 +1418,34 @@ inline bool BatchDotShape(const nnvm::NodeAttrs& attrs, mxnet::TShape& rshape = (*in_attrs)[1]; // return false if lhs and rhs both have fully unknown shape if (!ndim_is_known(lshape) || !ndim_is_known(rshape)) return false; - if (lshape.ndim() == 3 && rshape.ndim() == 3) { + if (lshape.ndim() >= 3 && rshape.ndim() >= 3 && lshape.ndim() == rshape.ndim()) { + int ndim = lshape.ndim(); // only partially infer shape if last dim of lhs and second dim of rhs is known - bool last_dim_known = dim_size_is_known(lshape, 2); - bool second_dim_known = dim_size_is_known(rshape, 1); + bool last_dim_known = dim_size_is_known(lshape, ndim - 1); + bool second_dim_known = dim_size_is_known(rshape, ndim - 2); if ( !last_dim_known || !second_dim_known) return false; - CHECK(lshape[0] == rshape[0]) - << "batch_dot shape error(batch_size must be equal): " << lshape << " X " << rshape - << " trans_a=" << param.transpose_a << " trans_b=" << param.transpose_b; - index_t out_m = param.transpose_a ? lshape[2] : lshape[1]; - index_t lshape_k = param.transpose_a ? lshape[1] : lshape[2]; - index_t out_n = param.transpose_b ? rshape[1] : rshape[2]; - index_t rshape_k = param.transpose_b ? rshape[2] : rshape[1]; - CHECK(lshape_k == rshape_k) - << "batch_dot shape error(shape mismatch): " << lshape << " X " << rshape + for (int i = 0; i < ndim - 2; i++) { + CHECK_EQ(lshape[i], rshape[i]) + << "batch_dot shape error (the leading batch dimensions must be equal): " + << lshape << " X " << rshape + << " trans_a=" << param.transpose_a << " trans_b=" << param.transpose_b; + } + dim_t out_m = param.transpose_a ? lshape[ndim - 1] : lshape[ndim - 2]; + dim_t lshape_k = param.transpose_a ? lshape[ndim - 2] : lshape[ndim - 1]; + dim_t out_n = param.transpose_b ? rshape[ndim - 2] : rshape[ndim - 1]; + dim_t rshape_k = param.transpose_b ? 
rshape[ndim - 1] : rshape[ndim - 2]; + CHECK_EQ(lshape_k, rshape_k) + << "batch_dot shape error (shape mismatch): " << lshape << " X " << rshape << " trans_a=" << param.transpose_a << " trans_b=" << param.transpose_b; - SHAPE_ASSIGN_CHECK(*out_attrs, 0, mshadow::Shape3(lshape[0], out_m, out_n)); + std::vector out_shape_vec; + for (int i = 0; i < ndim - 2; i++) { + out_shape_vec.push_back(lshape[i]); + } + out_shape_vec.push_back(out_m); + out_shape_vec.push_back(out_n); + SHAPE_ASSIGN_CHECK(*out_attrs, 0, mxnet::TShape(out_shape_vec)); } else { - LOG(FATAL) << "batch_dot currently only support 3D*3D array" + LOG(FATAL) << "batch_dot currently only support N-D*N-D array (N >= 3)" << lshape << " v.s. " << rshape; } // return true if output shape is fully inferred diff --git a/src/operator/tensor/dot.cc b/src/operator/tensor/dot.cc index 11a056146e1d..556260ed9600 100644 --- a/src/operator/tensor/dot.cc +++ b/src/operator/tensor/dot.cc @@ -115,13 +115,13 @@ NNVM_REGISTER_OP(batch_dot) .describe(R"doc(Batchwise dot product. ``batch_dot`` is used to compute dot product of ``x`` and ``y`` when ``x`` and -``y`` are data in batch, namely 3D arrays in shape of `(batch_size, :, :)`. +``y`` are data in batch, namely N-D (N >= 3) arrays in shape of `(B0, ..., B_i, :, :)`. -For example, given ``x`` with shape `(batch_size, n, m)` and ``y`` with shape -`(batch_size, m, k)`, the result array will have shape `(batch_size, n, k)`, +For example, given ``x`` with shape `(B_0, ..., B_i, N, M)` and ``y`` with shape +`(B_0, ..., B_i, M, K)`, the result array will have shape `(B_0, ..., B_i, N, K)`, which is computed by:: - batch_dot(x,y)[i,:,:] = dot(x[i,:,:], y[i,:,:]) + batch_dot(x,y)[b_0, ..., b_i, :, :] = dot(x[b_0, ..., b_i, :, :], y[b_0, ..., b_i, :, :]) )doc" ADD_FILELINE) .set_num_inputs(2) @@ -138,21 +138,73 @@ which is computed by:: return std::vector{ResourceRequest::kTempSpace}; }) .set_attr("FCompute", BatchDotForward_) -.set_attr("FGradient", ElemwiseGradUseIn{"_backward_batch_dot"}) +.set_attr("FGradient", + [](const nnvm::NodePtr& n, + const std::vector& ograds) { + const DotParam& param = nnvm::get(n->attrs.parsed); + nnvm::NodePtr lhs_grad; + nnvm::NodePtr rhs_grad; + std::string lhs_gnode_name = n->attrs.name + "_backward_lhs"; + std::string rhs_gnode_name = n->attrs.name + "_backward_rhs"; + if (param.transpose_a && param.transpose_b) { + // Gradient of z = dot(x.T, y.T) + // dx = dot(dz, y).T = dot(y.T, dz.T) + // dy = dot(x, dz).T = dot(dz.T, x.T) + lhs_grad = MakeNode("batch_dot", lhs_gnode_name, + {n->inputs[1], ograds[0]}, &(n->attrs.dict), &n); + rhs_grad = MakeNode("batch_dot", rhs_gnode_name, + {ograds[0], n->inputs[0]}, &(n->attrs.dict), &n); + } else if (!param.transpose_a && param.transpose_b) { + // Gradient of z = dot(x, y.T) + // dx = dot(dz, y) + // dy = dot(x.T, dz).T = dot(dz.T, x) + auto lhs_attrs_dict = n->attrs.dict; + auto rhs_attrs_dict = n->attrs.dict; + lhs_attrs_dict["transpose_a"] = "false"; + lhs_attrs_dict["transpose_b"] = "false"; + rhs_attrs_dict["transpose_a"] = "true"; + rhs_attrs_dict["transpose_b"] = "false"; + lhs_grad = MakeNode("batch_dot", lhs_gnode_name, + {ograds[0], n->inputs[1]}, &lhs_attrs_dict, &n); + rhs_grad = MakeNode("batch_dot", rhs_gnode_name, + {ograds[0], n->inputs[0]}, &rhs_attrs_dict, &n); + } else if (param.transpose_a && !param.transpose_b) { + // Gradient of z = dot(x.T, y) + // dx = dot(dz, y.T).T = dot(y, dz.T) + // dy = dot(x, dz) + auto lhs_attrs_dict = n->attrs.dict; + auto rhs_attrs_dict = n->attrs.dict; + 
lhs_attrs_dict["transpose_a"] = "false"; + lhs_attrs_dict["transpose_b"] = "true"; + rhs_attrs_dict["transpose_a"] = "false"; + rhs_attrs_dict["transpose_b"] = "false"; + lhs_grad = MakeNode("batch_dot", lhs_gnode_name, + {n->inputs[1], ograds[0]}, &lhs_attrs_dict, &n); + rhs_grad = MakeNode("batch_dot", rhs_gnode_name, + {n->inputs[0], ograds[0]}, &rhs_attrs_dict, &n); + } else { + // Gradient of z = dot(x, y) + // dx = dot(dz, y.T) + // dy = dot(x.T, dz) + auto lhs_attrs_dict = n->attrs.dict; + auto rhs_attrs_dict = n->attrs.dict; + lhs_attrs_dict["transpose_a"] = "false"; + lhs_attrs_dict["transpose_b"] = "true"; + rhs_attrs_dict["transpose_a"] = "true"; + rhs_attrs_dict["transpose_b"] = "false"; + lhs_grad = MakeNode("batch_dot", lhs_gnode_name, + {ograds[0], n->inputs[1]}, &lhs_attrs_dict, &n); + rhs_grad = MakeNode("batch_dot", rhs_gnode_name, + {n->inputs[0], ograds[0]}, &rhs_attrs_dict, &n); + } + std::vector ret; + ret.emplace_back(nnvm::NodeEntry{lhs_grad, 0, 0}); + ret.emplace_back(nnvm::NodeEntry{rhs_grad, 0, 0}); + return ret; +}) .add_argument("lhs", "NDArray-or-Symbol", "The first input") .add_argument("rhs", "NDArray-or-Symbol", "The second input") .add_arguments(DotParam::__FIELDS__()); -NNVM_REGISTER_OP(_backward_batch_dot) -.set_num_inputs(3) -.set_num_outputs(2) -.set_attr_parser(ParamParser) -.set_attr("FResourceRequest", - [](const NodeAttrs& attrs) { - return std::vector{ResourceRequest::kTempSpace}; - }) -.set_attr("TIsBackward", true) -.set_attr("FCompute", BatchDotBackward_); - } // namespace op } // namespace mxnet diff --git a/src/operator/tensor/dot.cu b/src/operator/tensor/dot.cu index 8ee2e2832fbb..b245b1c9e5ed 100644 --- a/src/operator/tensor/dot.cu +++ b/src/operator/tensor/dot.cu @@ -38,8 +38,5 @@ NNVM_REGISTER_OP(_backward_dot) NNVM_REGISTER_OP(batch_dot) .set_attr("FCompute", BatchDotForward_); -NNVM_REGISTER_OP(_backward_batch_dot) -.set_attr("FCompute", BatchDotBackward_); - } // namespace op } // namespace mxnet diff --git a/tests/python/unittest/test_numpy_op.py b/tests/python/unittest/test_numpy_op.py index b764ac73d30c..ae8ad621df75 100644 --- a/tests/python/unittest/test_numpy_op.py +++ b/tests/python/unittest/test_numpy_op.py @@ -24,6 +24,7 @@ import platform import mxnet as mx import scipy.stats as ss +from nose.tools import assert_raises from mxnet import np, npx from mxnet.gluon import HybridBlock from mxnet.base import MXNetError @@ -901,6 +902,124 @@ def hybrid_forward(self, F, a): expected_grad[basic_index] = 1 assert same(a.grad.asnumpy(), expected_grad) +@with_seed() +@use_np +def test_npx_batch_dot(): + ctx = mx.context.current_context() + dtypes = ['float32', 'float64'] + if ctx.device_type == 'gpu': + dtypes += ['float16'] + eps_dict = {'float32': 1E-4, 'float64': 1E-4, 'float16': 1E-3} + class TestBatchDot(HybridBlock): + def __init__(self, transpose_a, transpose_b): + super(TestBatchDot, self).__init__() + self._transpose_a = transpose_a + self._transpose_b = transpose_b + + def hybrid_forward(self, F, lhs, rhs): + return F.npx.batch_dot(lhs, rhs, + transpose_a=self._transpose_a, + transpose_b=self._transpose_b) + + def batch_dot_numpy(lhs, rhs, transpose_a, transpose_b): + assert lhs.ndim == rhs.ndim >= 3 + if transpose_a: + lhs = lhs.swapaxes(-1, -2) + if transpose_b: + rhs = rhs.swapaxes(-1, -2) + return _np.matmul(lhs, rhs) + + def gt_grad_batch_dot_numpy(lhs, rhs, ograd, transpose_a, transpose_b, lhs_req, rhs_req, + init_lhs_grad, init_rhs_grad): + + if transpose_a and transpose_b: + # Gradient of z = dot(x.T, y.T) + # dx = 
dot(dz, y).T = dot(y.T, dz.T)
+            # dy = dot(x, dz).T = dot(dz.T, x.T)
+            lhs_grad = batch_dot_numpy(rhs, ograd, transpose_a=True, transpose_b=True)
+            rhs_grad = batch_dot_numpy(ograd, lhs, transpose_a=True, transpose_b=True)
+        elif not transpose_a and transpose_b:
+            # Gradient of z = dot(x, y.T)
+            # dx = dot(dz, y)
+            # dy = dot(x.T, dz).T = dot(dz.T, x)
+            lhs_grad = batch_dot_numpy(ograd, rhs, transpose_a=False, transpose_b=False)
+            rhs_grad = batch_dot_numpy(ograd, lhs, transpose_a=True, transpose_b=False)
+        elif transpose_a and not transpose_b:
+            # Gradient of z = dot(x.T, y)
+            # dx = dot(dz, y.T).T = dot(y, dz.T)
+            # dy = dot(x, dz)
+            lhs_grad = batch_dot_numpy(rhs, ograd, transpose_a=False, transpose_b=True)
+            rhs_grad = batch_dot_numpy(lhs, ograd, transpose_a=False, transpose_b=False)
+        else:
+            # Gradient of z = dot(x, y)
+            # dx = dot(dz, y.T)
+            # dy = dot(x.T, dz)
+            lhs_grad = batch_dot_numpy(ograd, rhs, transpose_a=False, transpose_b=True)
+            rhs_grad = batch_dot_numpy(lhs, ograd, transpose_a=True, transpose_b=False)
+        if lhs_req == 'add':
+            lhs_grad += init_lhs_grad
+        if rhs_req == 'add':
+            rhs_grad += init_rhs_grad
+        return lhs_grad, rhs_grad
+
+
+    configs = [
+        ((2, 3, 0), (2, 4, 0), False, True),
+        ((2, 4, 3), (2, 4, 3), True, False),
+        ((0, 3, 0), (0, 0, 2), False, False),
+        ((3, 2, 3, 2), (3, 2, 2, 3), True, True),
+        ((3, 1, 5, 2), (3, 1, 2, 1), False, False)
+    ]
+    bad_configs = [
+        ((5, 3, 2), (5, 1, 3), False, False),
+        ((2, 5, 3, 1), (2, 4, 3, 1), True, False)
+    ]
+    for hybridize in [True, False]:
+        for lhs_shape, rhs_shape, transpose_a, transpose_b in configs:
+            for dtype in dtypes:
+                eps = eps_dict[dtype]
+                for lhs_grad_req in ['write', 'add']:
+                    for rhs_grad_req in ['write', 'add']:
+                        f_batch_dot = TestBatchDot(transpose_a=transpose_a,
+                                                   transpose_b=transpose_b)
+                        if hybridize:
+                            f_batch_dot.hybridize()
+                        lhs_val = mx.np.array(_np.random.uniform(-1.0, 1.0, lhs_shape), dtype=dtype)
+                        rhs_val = mx.np.array(_np.random.uniform(-1.0, 1.0, rhs_shape), dtype=dtype)
+                        lhs_val.attach_grad(grad_req=lhs_grad_req)
+                        rhs_val.attach_grad(grad_req=rhs_grad_req)
+                        gt_out = batch_dot_numpy(lhs_val.asnumpy(), rhs_val.asnumpy(),
+                                                 transpose_a, transpose_b)
+                        init_lhs_grad = mx.np.random.uniform(-1.0, 1.0, lhs_shape, dtype=dtype)
+                        init_rhs_grad = mx.np.random.uniform(-1.0, 1.0, rhs_shape, dtype=dtype)
+                        o_grad = mx.np.random.uniform(-1.0, 1.0, gt_out.shape, dtype=dtype)
+                        if lhs_grad_req == 'add':
+                            lhs_val.grad[:] = init_lhs_grad
+                        if rhs_grad_req == 'add':
+                            rhs_val.grad[:] = init_rhs_grad
+                        with mx.autograd.record():
+                            out = f_batch_dot(lhs_val, rhs_val)
+                        out.backward(o_grad)
+                        assert_almost_equal(out.asnumpy(), gt_out, rtol=eps, atol=eps)
+                        gt_lhs_grad, gt_rhs_grad = gt_grad_batch_dot_numpy(lhs_val.asnumpy(),
+                                                                           rhs_val.asnumpy(),
+                                                                           o_grad.asnumpy(),
+                                                                           transpose_a=transpose_a,
+                                                                           transpose_b=transpose_b,
+                                                                           lhs_req=lhs_grad_req,
+                                                                           rhs_req=rhs_grad_req,
+                                                                           init_lhs_grad=init_lhs_grad.asnumpy(),
+                                                                           init_rhs_grad=init_rhs_grad.asnumpy())
+                        assert_almost_equal(lhs_val.grad.asnumpy(), gt_lhs_grad, rtol=eps, atol=eps)
+                        assert_almost_equal(rhs_val.grad.asnumpy(), gt_rhs_grad, rtol=eps, atol=eps)
+    for lhs_shape, rhs_shape, transpose_a, transpose_b in bad_configs:
+        for dtype in dtypes:
+            lhs_val = mx.np.array(_np.random.uniform(-1.0, 1.0, lhs_shape), dtype=dtype)
+            rhs_val = mx.np.array(_np.random.uniform(-1.0, 1.0, rhs_shape), dtype=dtype)
+            assert_raises(MXNetError, lambda: mx.npx.batch_dot(lhs_val, rhs_val,
+                                                               transpose_a=transpose_a,
+                                                               transpose_b=transpose_b))
+
 
 @with_seed()
 @use_np
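The four transpose cases in `gt_grad_batch_dot_numpy` mirror the `FGradient` registration above, and they are easy to cross-check outside MXNet. A minimal numpy sketch (not part of the patch; shapes chosen arbitrarily) that verifies the no-transpose identities `dx = dot(dz, y.T)` and `dy = dot(x.T, dz)` to first order:

```python
import numpy as np

np.random.seed(0)
x = np.random.randn(3, 1, 5, 2)   # (B_0, B_1, N, M)
y = np.random.randn(3, 1, 2, 4)   # (B_0, B_1, M, K)
dz = np.random.randn(3, 1, 5, 4)  # same shape as z = batch_dot(x, y)

dx = np.matmul(dz, y.swapaxes(-1, -2))  # dx = dot(dz, y.T)
dy = np.matmul(x.swapaxes(-1, -2), dz)  # dy = dot(x.T, dz)

# z is linear in each input, so perturbing an input by eps must change
# sum(z * dz) by exactly sum(grad * eps), up to float round-off.
eps = 1e-6 * np.random.randn(*x.shape)
delta = (np.matmul(x + eps, y) * dz).sum() - (np.matmul(x, y) * dz).sum()
assert np.allclose(delta, (dx * eps).sum())

eps = 1e-6 * np.random.randn(*y.shape)
delta = (np.matmul(x, y + eps) * dz).sum() - (np.matmul(x, y) * dz).sum()
assert np.allclose(delta, (dy * eps).sum())
```

The remaining three cases follow by substituting x -> x.T or y -> y.T before differentiating.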
diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py
index a16dc6c693ab..c87fa6148d3b 100644
--- a/tests/python/unittest/test_operator.py
+++ b/tests/python/unittest/test_operator.py
@@ -3291,9 +3291,9 @@ def test_batch_dot():
                 agrad_npy = np.empty((batch_size, m, k), dtype=data_type)
                 bgrad_npy = np.empty((batch_size, k, n), dtype=data_type)
                 a_init_grad_npy = np.random.normal(size=(batch_size, m, k))
-                a_init_grad_npy = a_npy.astype(data_type)
+                a_init_grad_npy = a_init_grad_npy.astype(data_type)
                 b_init_grad_npy = np.random.normal(size=(batch_size, k, n))
-                b_init_grad_npy = b_npy.astype(data_type)
+                b_init_grad_npy = b_init_grad_npy.astype(data_type)
                 for i in range(batch_size):
                     c_npy[i, :, :] = np.dot(a_npy[i, :, :], b_npy[i, :, :])
                     bgrad_npy[i, :, :] = np.dot(a_npy[i, :, :].T, ograd_npy[i, :, :])

From 0742a9b9c4d1c15a5141869cecedec139f325dea Mon Sep 17 00:00:00 2001
From: Chaitanya Prakash Bapat
Date: Wed, 23 Oct 2019 19:12:14 -0700
Subject: [PATCH 05/32] Large Vector tests for DGL Ops Part 2 (#16497)

* add hyperbolic, logical, sign and regression tests for large vector

* changed hyperbolic functions into existing trigonometric functions

* fix trig ops; simple_bind needs shape as a tuple

* fix logical ops, add with_seed

* fix arccosh in large array tests, remove regression tests from large vector
---
 tests/nightly/test_large_array.py      |  4 +-
 tests/nightly/test_large_vector.py     | 85 +++++++++++++++++++++++++-
 tests/python/unittest/test_operator.py |  1 +
 3 files changed, 85 insertions(+), 5 deletions(-)

diff --git a/tests/nightly/test_large_array.py b/tests/nightly/test_large_array.py
index 0cb21cedee35..c18a95400f22 100644
--- a/tests/nightly/test_large_array.py
+++ b/tests/nightly/test_large_array.py
@@ -1415,10 +1415,10 @@ def check_arcsinh():
         assert_correctness_of_trigonometric_ops(y, expected_output)
 
     def check_arccosh():
-        x = create_input_for_trigonometric_ops([1, np.pi/2, 3*np.pi/4, np.pi])
+        x = create_input_for_trigonometric_ops([1, np.pi/2, 3*np.pi/4, np.pi, 5*np.pi/4])
         y = nd.arccosh(x)
         # expected output for indices=(0, 1, -3, -2, -1) after applying arccosh()
-        expected_output = [0, np.arccosh(np.pi/2), np.arccosh(3*np.pi/4), np.arccosh(np.pi)]
+        expected_output = [0, np.arccosh(np.pi/2), np.arccosh(3*np.pi/4), np.arccosh(np.pi), np.arccosh(5*np.pi/4)]
         assert_correctness_of_trigonometric_ops(y, expected_output)
 
     def check_arctanh():
diff --git a/tests/nightly/test_large_vector.py b/tests/nightly/test_large_vector.py
index 23f4b8e4f310..b8edc83220bd 100644
--- a/tests/nightly/test_large_vector.py
+++ b/tests/nightly/test_large_vector.py
@@ -556,8 +556,8 @@ def test_concat():
     a = nd.ones(LARGE_X)
     b = nd.zeros(LARGE_X)
     c = nd.concat(a, b, dim=0)
-    assert c[0][0] == 1
-    assert c[-1][-1] == 0
+    assert c[0] == 1
+    assert c[-1] == 0
     assert c.shape[0] == (2 * LARGE_X)
 
 
@@ -710,6 +710,37 @@ def test_full():
     assert a[-1] == 3
 
 
+def test_sign():
+    a = mx.nd.random.normal(-1, 1, shape=LARGE_X)
+    mx_res = mx.nd.sign(a)
+    assert_almost_equal(mx_res[-1].asnumpy(), np.sign(a[-1].asnumpy()))
+
+
+def test_logical():
+    def check_logical_and(a, b):
+        mx_res = mx.nd.logical_and(a, b)
+        assert_almost_equal(mx_res[-1].asnumpy(), np.logical_and(a[-1].asnumpy(), b[-1].asnumpy()))
+
+    def check_logical_or(a, b):
+        mx_res = mx.nd.logical_or(a, b)
+        assert_almost_equal(mx_res[-1].asnumpy(), np.logical_or(a[-1].asnumpy(), b[-1].asnumpy()))
+
+    def check_logical_not(a):
+        # logical_not is unary, so no second operand is passed here
+        mx_res = mx.nd.logical_not(a)
+        assert_almost_equal(mx_res[-1].asnumpy(), np.logical_not(a[-1].asnumpy()))
+
+    def check_logical_xor(a, b):
+        mx_res = mx.nd.logical_xor(a, b)
+        assert_almost_equal(mx_res[-1].asnumpy(), np.logical_xor(a[-1].asnumpy(), b[-1].asnumpy()))
+
+    a = mx.nd.ones(LARGE_X)
+    b = mx.nd.zeros(LARGE_X)
+    check_logical_and(a, b)
+    check_logical_or(a, b)
+    check_logical_not(a)
+    check_logical_xor(a, b)
+
+
 def test_astype():
     x = create_vector(size=LARGE_X//4)
     x = nd.tile(x, 4)
@@ -752,7 +783,7 @@ def assert_correctness_of_rounding_ops(output, mid, expected_vals):
 
 def test_rounding_ops():
     x = create_input_for_rounding_ops()
-    
+
     def check_ceil():
         y = nd.ceil(x)
         # expected output for middle 5 values after applying ceil()
@@ -854,6 +885,48 @@ def check_tan():
         expected_output = [-.577, -1, 0, 1, .577]
         assert_correctness_of_trigonometric_ops(y, expected_output)
 
+    def check_arcsinh():
+        x = create_input_for_trigonometric_ops([-np.pi/2, -np.pi/4, 0, np.pi/4, np.pi/2])
+        y = nd.arcsinh(x)
+        # expected output for indices=(0, 1, -3, -2, -1) after applying arcsinh()
+        expected_output = [np.arcsinh(-np.pi/2), np.arcsinh(-np.pi/4), 0, np.arcsinh(np.pi/4), np.arcsinh(np.pi/2)]
+        assert_correctness_of_trigonometric_ops(y, expected_output)
+
+    def check_arccosh():
+        x = create_input_for_trigonometric_ops([1, np.pi/2, 3*np.pi/4, np.pi, 5*np.pi/4])
+        y = nd.arccosh(x)
+        # expected output for indices=(0, 1, -3, -2, -1) after applying arccosh()
+        expected_output = [0, np.arccosh(np.pi/2), np.arccosh(3*np.pi/4), np.arccosh(np.pi), np.arccosh(5*np.pi/4)]
+        assert_correctness_of_trigonometric_ops(y, expected_output)
+
+    def check_arctanh():
+        x = create_input_for_trigonometric_ops([-1/4, -1/2, 0, 1/4, 1/2])
+        y = nd.arctanh(x)
+        # expected output for indices=(0, 1, -3, -2, -1) after applying arctanh()
+        expected_output = [np.arctanh(-1/4), np.arctanh(-1/2), 0, np.arctanh(1/4), np.arctanh(1/2)]
+        assert_correctness_of_trigonometric_ops(y, expected_output)
+
+    def check_sinh():
+        x = create_input_for_trigonometric_ops([-np.pi/2, -np.pi/4, 0, np.pi/4, np.pi/2])
+        y = nd.sinh(x)
+        # expected output for indices=(0, 1, -3, -2, -1) after applying sinh()
+        expected_output = [np.sinh(-np.pi/2), np.sinh(-np.pi/4), 0, np.sinh(np.pi/4), np.sinh(np.pi/2)]
+        assert_correctness_of_trigonometric_ops(y, expected_output)
+
+    def check_cosh():
+        x = create_input_for_trigonometric_ops([0, 1, np.pi/2, 3*np.pi/4, np.pi])
+        y = nd.cosh(x)
+        # expected output for indices=(0, 1, -3, -2, -1) after applying cosh()
+        expected_output = [1, np.cosh(1), np.cosh(np.pi/2), np.cosh(3*np.pi/4), np.cosh(np.pi)]
+        assert_correctness_of_trigonometric_ops(y, expected_output)
+
+    def check_tanh():
+        x = create_input_for_trigonometric_ops([-1/4, -1/2, 0, 1/4, 1/2])
+        y = nd.tanh(x)
+        # expected output for indices=(0, 1, -3, -2, -1) after applying tanh()
+        expected_output = [np.tanh(-1/4), np.tanh(-1/2), 0, np.tanh(1/4), np.tanh(1/2)]
+        assert_correctness_of_trigonometric_ops(y, expected_output)
+
     def check_radians():
         x = create_input_for_trigonometric_ops([0, 90, 180, 270, 360])
         y = nd.radians(x)
@@ -874,6 +947,12 @@ def check_degrees():
     check_sin()
     check_cos()
     check_tan()
+    check_arcsinh()
+    check_arccosh()
+    check_arctanh()
+    check_sinh()
+    check_cosh()
+    check_tanh()
     check_radians()
     check_degrees()
 
diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py
index c87fa6148d3b..075816fdc6de 100644
--- a/tests/python/unittest/test_operator.py
+++ b/tests/python/unittest/test_operator.py
@@ -2931,6 +2931,7 @@ def test_big_transpose():
     assert_allclose(x_np, z.asnumpy().astype('uint8'))
 
 
+@with_seed()
 def
test_larger_transpose(): x = mx.nd.random.normal(shape=(50,51)) y = mx.nd.transpose(x) From ca5a2a0b1dface8b7849b4674694e7458a234ade Mon Sep 17 00:00:00 2001 From: Jake Lee Date: Wed, 23 Oct 2019 23:31:17 -0700 Subject: [PATCH 06/32] [Numpy] Loading numpy-incompatible NDArray in numpy-compatible mode (#16597) * Make MXIsNumpyShape return enum * address the comment --- include/mxnet/c_api.h | 2 +- include/mxnet/imperative.h | 10 ++++++---- .../src/main/native/org_apache_mxnet_native_c_api.cc | 4 ++-- src/c_api/c_api_ndarray.cc | 2 +- src/ndarray/ndarray.cc | 3 ++- 5 files changed, 12 insertions(+), 9 deletions(-) diff --git a/include/mxnet/c_api.h b/include/mxnet/c_api.h index 177ec5d40146..ac0c6726f2c7 100644 --- a/include/mxnet/c_api.h +++ b/include/mxnet/c_api.h @@ -1170,7 +1170,7 @@ MXNET_DLL int MXAutogradIsTraining(bool* curr); * \param curr returns the current status * \return 0 when success, -1 when failure happens */ -MXNET_DLL int MXIsNumpyShape(bool* curr); +MXNET_DLL int MXIsNumpyShape(int* curr); /*! * \brief set numpy compatibility switch * \param is_np_shape 1 when numpy shape semantics is thread local on, diff --git a/include/mxnet/imperative.h b/include/mxnet/imperative.h index 18f6424e54f7..dbd81e575872 100644 --- a/include/mxnet/imperative.h +++ b/include/mxnet/imperative.h @@ -108,12 +108,14 @@ class Imperative { is_recording_ = is_recording; return old; } - /*! \brief whether numpy compatibility is on. */ - bool is_np_shape() const { + /*! \brief return current numpy compatibility status, + * GlobalOn(2), ThreadLocalOn(1), Off(0). + * */ + int is_np_shape() const { if (is_np_shape_global_) { - return true; + return 2; } - return is_np_shape_thread_local_; + return is_np_shape_thread_local_ ? 1 : 0; } /*! \brief specify numpy compatibility off, thread local on or global on. */ bool set_is_np_shape(int is_np_shape) { diff --git a/scala-package/native/src/main/native/org_apache_mxnet_native_c_api.cc b/scala-package/native/src/main/native/org_apache_mxnet_native_c_api.cc index 5c704c9646a2..26eea3dd062b 100644 --- a/scala-package/native/src/main/native/org_apache_mxnet_native_c_api.cc +++ b/scala-package/native/src/main/native/org_apache_mxnet_native_c_api.cc @@ -2777,9 +2777,9 @@ JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxDumpProfile // Numpy JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxIsNumpyShape (JNIEnv *env, jobject obj, jobject compatibleRef) { - bool isNumpyShape; + int isNumpyShape; int ret = MXIsNumpyShape(&isNumpyShape); - SetIntField(env, compatibleRef, static_cast(isNumpyShape)); + SetIntField(env, compatibleRef, isNumpyShape); return ret; } diff --git a/src/c_api/c_api_ndarray.cc b/src/c_api/c_api_ndarray.cc index b80e17c18071..de208c0fed99 100644 --- a/src/c_api/c_api_ndarray.cc +++ b/src/c_api/c_api_ndarray.cc @@ -276,7 +276,7 @@ int MXAutogradSetIsRecording(int is_recording, int* prev) { API_END(); } -int MXIsNumpyShape(bool* curr) { +int MXIsNumpyShape(int* curr) { API_BEGIN(); *curr = Imperative::Get()->is_np_shape(); API_END(); diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc index e15f72fa6cfa..44da670b800d 100644 --- a/src/ndarray/ndarray.cc +++ b/src/ndarray/ndarray.cc @@ -1732,7 +1732,8 @@ bool NDArray::Load(dmlc::Stream *strm) { " Please turn on np shape semantics in Python using `with np_shape(True)`" " or decorator `use_np_shape` to scope the code of loading the ndarray."; } else { - CHECK(!Imperative::Get()->is_np_shape()) + // when the flag is global on, skip the check since it would be always global on. 
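+      // is_np_shape() now returns an int: Off(0), ThreadLocalOn(1) or
+      // GlobalOn(2). Only a thread-local setting can be scoped off around
+      // this load, so the check is skipped entirely when the flag is global.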
+      CHECK(Imperative::Get()->is_np_shape() == GlobalOn || !Imperative::Get()->is_np_shape())
          << "ndarray was not saved in np shape semantics, but being loaded in np shape semantics."
             " Please turn off np shape semantics in Python using `with np_shape(False)`"
             " to scope the code of loading the ndarray.";

From 8270672d249aa634ad161b9014cd0020b0bb553f Mon Sep 17 00:00:00 2001
From: Zhennan Qin
Date: Thu, 24 Oct 2019 04:47:01 -0500
Subject: [PATCH 07/32] Suppress subgraph log in CI (#16607)

Change-Id: Ia2ed6fdbb1d2cb5cc607a8856ca13ee338e27eac
---
 .travis.yml                                   |  1 +
 ci/docker/runtime_functions.sh                | 19 +++++++++++++++-
 ci/windows/test_py2_cpu.ps1                   |  1 +
 ci/windows/test_py2_gpu.ps1                   |  1 +
 ci/windows/test_py3_cpu.ps1                   |  1 +
 ci/windows/test_py3_gpu.ps1                   |  1 +
 perl-package/AI-MXNet/t/test_autograd.t       |  1 +
 perl-package/AI-MXNet/t/test_gluon_trainer.t  |  2 +-
 perl-package/AI-MXNet/t/test_module.t         |  1 +
 perl-package/AI-MXNet/t/test_sparse_ndarray.t |  1 +
 src/executor/graph_executor.cc                | 22 +++++++++----------
 src/operator/subgraph/build_subgraph.cc       |  8 +++----
 12 files changed, 42 insertions(+), 17 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index b0aa26c1a3a1..485faadee277 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -34,6 +34,7 @@ script:
   # Temporarily disable travis build due to travis constantly timing out, tracked in
   # https://github.com/apache/incubator-mxnet/issues/16535:
   - export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0
+  - export MXNET_SUBGRAPH_VERBOSE=0
   - mv make/osx.mk config.mk
   # - make -j 2

diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh
index 935406bb16a8..aab49f28a427 100755
--- a/ci/docker/runtime_functions.sh
+++ b/ci/docker/runtime_functions.sh
@@ -1006,6 +1006,7 @@ cd_unittest_ubuntu() {
     export PYTHONPATH=./python/
     export MXNET_MKLDNN_DEBUG=1  # Ignored if not present
     export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0
+    export MXNET_SUBGRAPH_VERBOSE=0
     export MXNET_ENABLE_CYTHON=0
     export CD_JOB=1  # signal this is a CD run so any unnecessary tests can be skipped
@@ -1048,6 +1049,7 @@ unittest_ubuntu_python2_cpu_cython() {
     export PYTHONPATH=./python/
     export MXNET_MKLDNN_DEBUG=1
     export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0
+    export MXNET_SUBGRAPH_VERBOSE=0
     export MXNET_ENABLE_CYTHON=1
     export MXNET_ENFORCE_CYTHON=1
     check_cython 2
@@ -1061,6 +1063,7 @@ unittest_ubuntu_python2_cpu() {
     export PYTHONPATH=./python/
     export MXNET_MKLDNN_DEBUG=0
     export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0
+    export MXNET_SUBGRAPH_VERBOSE=0
     export MXNET_ENABLE_CYTHON=0
     nosetests-2.7 $NOSE_COVERAGE_ARGUMENTS $NOSE_TIMER_ARGUMENTS --with-xunit --xunit-file nosetests_unittest.xml --verbose tests/python/unittest
     nosetests-2.7 $NOSE_COVERAGE_ARGUMENTS $NOSE_TIMER_ARGUMENTS --with-xunit --xunit-file nosetests_train.xml --verbose tests/python/train
@@ -1072,6 +1075,7 @@ unittest_ubuntu_python3_cpu() {
     export PYTHONPATH=./python/
     export MXNET_MKLDNN_DEBUG=0  # Ignored if not present
     export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0
+    export MXNET_SUBGRAPH_VERBOSE=0
     export MXNET_ENABLE_CYTHON=0
     nosetests-3.4 $NOSE_COVERAGE_ARGUMENTS $NOSE_TIMER_ARGUMENTS --with-xunit --xunit-file nosetests_unittest.xml --verbose tests/python/unittest
     nosetests-3.4 $NOSE_COVERAGE_ARGUMENTS $NOSE_TIMER_ARGUMENTS --with-xunit --xunit-file nosetests_quantization.xml --verbose tests/python/quantization
@@ -1082,6 +1086,7 @@ unittest_ubuntu_python3_cpu_mkldnn() {
     export PYTHONPATH=./python/
     export MXNET_MKLDNN_DEBUG=0  # Ignored if not present
     export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0
+    export MXNET_SUBGRAPH_VERBOSE=0
     export
MXNET_ENABLE_CYTHON=0 nosetests-3.4 $NOSE_COVERAGE_ARGUMENTS $NOSE_TIMER_ARGUMENTS --with-xunit --xunit-file nosetests_unittest.xml --verbose tests/python/unittest nosetests-3.4 $NOSE_COVERAGE_ARGUMENTS $NOSE_TIMER_ARGUMENTS --with-xunit --xunit-file nosetests_mkl.xml --verbose tests/python/mkl @@ -1092,6 +1097,7 @@ unittest_ubuntu_python2_gpu() { export PYTHONPATH=./python/ export MXNET_MKLDNN_DEBUG=0 # Ignored if not present export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0 + export MXNET_SUBGRAPH_VERBOSE=0 export CUDNN_VERSION=${CUDNN_VERSION:-7.0.3} nosetests-2.7 $NOSE_COVERAGE_ARGUMENTS $NOSE_TIMER_ARGUMENTS --with-xunit --xunit-file nosetests_gpu.xml --verbose tests/python/gpu } @@ -1101,6 +1107,7 @@ unittest_ubuntu_python3_gpu() { export PYTHONPATH=./python/ export MXNET_MKLDNN_DEBUG=0 # Ignored if not present export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0 + export MXNET_SUBGRAPH_VERBOSE=0 export CUDNN_VERSION=${CUDNN_VERSION:-7.0.3} export MXNET_ENABLE_CYTHON=0 nosetests-3.4 $NOSE_COVERAGE_ARGUMENTS $NOSE_TIMER_ARGUMENTS --with-xunit --xunit-file nosetests_gpu.xml --verbose tests/python/gpu @@ -1111,6 +1118,7 @@ unittest_ubuntu_python3_gpu_cython() { export PYTHONPATH=./python/ export MXNET_MKLDNN_DEBUG=1 # Ignored if not present export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0 + export MXNET_SUBGRAPH_VERBOSE=0 export CUDNN_VERSION=${CUDNN_VERSION:-7.0.3} export MXNET_ENABLE_CYTHON=1 export MXNET_ENFORCE_CYTHON=1 @@ -1122,6 +1130,7 @@ unittest_ubuntu_python3_gpu_nocudnn() { set -ex export PYTHONPATH=./python/ export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0 + export MXNET_SUBGRAPH_VERBOSE=0 export CUDNN_OFF_TEST_ONLY=true export MXNET_ENABLE_CYTHON=0 nosetests-3.4 $NOSE_COVERAGE_ARGUMENTS $NOSE_TIMER_ARGUMENTS --with-xunit --xunit-file nosetests_gpu.xml --verbose tests/python/gpu @@ -1131,6 +1140,7 @@ unittest_ubuntu_tensorrt_gpu() { set -ex export PYTHONPATH=./python/ export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0 + export MXNET_SUBGRAPH_VERBOSE=0 export LD_LIBRARY_PATH=/work/mxnet/lib:$LD_LIBRARY_PATH export CUDNN_VERSION=${CUDNN_VERSION:-7.0.3} export MXNET_ENABLE_CYTHON=0 @@ -1145,6 +1155,7 @@ unittest_ubuntu_python2_quantization_gpu() { export PYTHONPATH=./python/ export MXNET_MKLDNN_DEBUG=0 # Ignored if not present export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0 + export MXNET_SUBGRAPH_VERBOSE=0 export CUDNN_VERSION=${CUDNN_VERSION:-7.0.3} export MXNET_ENABLE_CYTHON=0 nosetests-2.7 $NOSE_COVERAGE_ARGUMENTS $NOSE_TIMER_ARGUMENTS --with-xunit --xunit-file nosetests_quantization_gpu.xml --verbose tests/python/quantization_gpu @@ -1157,6 +1168,7 @@ unittest_ubuntu_python3_quantization_gpu() { export PYTHONPATH=./python/ export MXNET_MKLDNN_DEBUG=0 # Ignored if not present export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0 + export MXNET_SUBGRAPH_VERBOSE=0 export CUDNN_VERSION=${CUDNN_VERSION:-7.0.3} export MXNET_ENABLE_CYTHON=0 nosetests-3.4 $NOSE_COVERAGE_ARGUMENTS $NOSE_TIMER_ARGUMENTS --with-xunit --xunit-file nosetests_quantization_gpu.xml --verbose tests/python/quantization_gpu @@ -1319,6 +1331,7 @@ integrationtest_ubuntu_gpu_python() { set -ex export PYTHONPATH=./python/ export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0 + export MXNET_SUBGRAPH_VERBOSE=0 python example/image-classification/test_score.py } @@ -1347,6 +1360,7 @@ integrationtest_ubuntu_cpu_dist_kvstore() { pushd . 
export PYTHONPATH=./python/ export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0 + export MXNET_SUBGRAPH_VERBOSE=0 export MXNET_USE_OPERATOR_TUNING=0 cd tests/nightly/ ../../tools/launch.py -n 7 --launcher local python dist_sync_kvstore.py --type=gluon_step_cpu @@ -1381,6 +1395,7 @@ integrationtest_ubuntu_gpu_dist_kvstore() { pushd . export PYTHONPATH=./python/ export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0 + export MXNET_SUBGRAPH_VERBOSE=0 cd tests/nightly/ ../../tools/launch.py -n 4 --launcher local python dist_device_sync_kvstore.py ../../tools/launch.py -n 4 --launcher local python dist_sync_kvstore.py --type=init_gpu @@ -1568,6 +1583,7 @@ nightly_tutorial_test_ubuntu_python3_gpu() { export MXNET_DOCS_BUILD_MXNET=0 make html export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0 + export MXNET_SUBGRAPH_VERBOSE=0 export PYTHONPATH=/work/mxnet/python/ export MXNET_TUTORIAL_TEST_KERNEL=python3 cd /work/mxnet/tests/tutorials @@ -1581,6 +1597,7 @@ nightly_tutorial_test_ubuntu_python2_gpu() { export MXNET_DOCS_BUILD_MXNET=0 make html export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0 + export MXNET_SUBGRAPH_VERBOSE=0 export PYTHONPATH=/work/mxnet/python/ export MXNET_TUTORIAL_TEST_KERNEL=python2 cd /work/mxnet/tests/tutorials @@ -1974,7 +1991,7 @@ cd_package_pypi() { popd } -# Sanity checks wheel file +# Sanity checks wheel file cd_integration_test_pypi() { set -ex local python_cmd=${1:?"This function requires a python command as the first argument"} diff --git a/ci/windows/test_py2_cpu.ps1 b/ci/windows/test_py2_cpu.ps1 index df9b15ba1ec3..c39d1fa45328 100644 --- a/ci/windows/test_py2_cpu.ps1 +++ b/ci/windows/test_py2_cpu.ps1 @@ -20,6 +20,7 @@ $env:MXNET_LIBRARY_PATH=join-path $pwd.Path windows_package\lib\libmxnet.dll $env:PYTHONPATH=join-path $pwd.Path windows_package\python $env:MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0 +$env:MXNET_SUBGRAPH_VERBOSE=0 $env:MXNET_HOME=[io.path]::combine($PSScriptRoot, 'mxnet_home') C:\Python27\Scripts\pip install -r tests\requirements.txt diff --git a/ci/windows/test_py2_gpu.ps1 b/ci/windows/test_py2_gpu.ps1 index f2974ff6f7b6..b2ea62fc7cd4 100644 --- a/ci/windows/test_py2_gpu.ps1 +++ b/ci/windows/test_py2_gpu.ps1 @@ -20,6 +20,7 @@ $env:MXNET_LIBRARY_PATH=join-path $pwd.Path windows_package\lib\libmxnet.dll $env:PYTHONPATH=join-path $pwd.Path windows_package\python $env:MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0 +$env:MXNET_SUBGRAPH_VERBOSE=0 $env:MXNET_HOME=[io.path]::combine($PSScriptRoot, 'mxnet_home') C:\Python27\Scripts\pip install -r tests\requirements.txt diff --git a/ci/windows/test_py3_cpu.ps1 b/ci/windows/test_py3_cpu.ps1 index 900bfd161cd0..1e09b5c98ce1 100644 --- a/ci/windows/test_py3_cpu.ps1 +++ b/ci/windows/test_py3_cpu.ps1 @@ -20,6 +20,7 @@ $env:MXNET_LIBRARY_PATH=join-path $pwd.Path windows_package\lib\libmxnet.dll $env:PYTHONPATH=join-path $pwd.Path windows_package\python $env:MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0 +$env:MXNET_SUBGRAPH_VERBOSE=0 $env:MXNET_HOME=[io.path]::combine($PSScriptRoot, 'mxnet_home') C:\Python37\Scripts\pip install -r tests\requirements.txt diff --git a/ci/windows/test_py3_gpu.ps1 b/ci/windows/test_py3_gpu.ps1 index b6e951b291fb..9bf7d04d8a88 100644 --- a/ci/windows/test_py3_gpu.ps1 +++ b/ci/windows/test_py3_gpu.ps1 @@ -20,6 +20,7 @@ $env:MXNET_LIBRARY_PATH=join-path $pwd.Path windows_package\lib\libmxnet.dll $env:PYTHONPATH=join-path $pwd.Path windows_package\python $env:MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0 +$env:MXNET_SUBGRAPH_VERBOSE=0 $env:MXNET_HOME=[io.path]::combine($PSScriptRoot, 'mxnet_home') C:\Python37\Scripts\pip install -r tests\requirements.txt 
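The same switch works outside of CI. As the executor changes later in this patch show, `MXNET_SUBGRAPH_VERBOSE` is read via `dmlc::GetEnv` into a function-local static on first use, so user code only needs to set it before the first model is bound; a minimal sketch:

```python
import os

# Silence subgraph-partitioning messages; set this before the first model
# bind, because the C++ side caches the value on first read.
os.environ["MXNET_SUBGRAPH_VERBOSE"] = "0"

import mxnet as mx

net = mx.gluon.nn.Dense(1)
net.initialize()
net(mx.nd.ones((1, 4)))  # graphs executed from here on inherit the quieter setting
```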
diff --git a/perl-package/AI-MXNet/t/test_autograd.t b/perl-package/AI-MXNet/t/test_autograd.t index 931c6d59333b..2ddad60df989 100644 --- a/perl-package/AI-MXNet/t/test_autograd.t +++ b/perl-package/AI-MXNet/t/test_autograd.t @@ -23,6 +23,7 @@ use AI::MXNet::TestUtils qw(same almost_equal rand_ndarray); use AI::MXNet::Base qw(:DEFAULT pones); use Test::More tests => 246; $ENV{MXNET_STORAGE_FALLBACK_LOG_VERBOSE} = 0; +$ENV{MXNET_SUBGRAPH_VERBOSE} = 0; sub autograd_assert { diff --git a/perl-package/AI-MXNet/t/test_gluon_trainer.t b/perl-package/AI-MXNet/t/test_gluon_trainer.t index 81113af28c20..3b1130af4ecf 100644 --- a/perl-package/AI-MXNet/t/test_gluon_trainer.t +++ b/perl-package/AI-MXNet/t/test_gluon_trainer.t @@ -25,6 +25,7 @@ use AI::MXNet::TestUtils qw(almost_equal dies_ok); use Scalar::Util qw(refaddr); use AI::MXNet::Base; $ENV{MXNET_STORAGE_FALLBACK_LOG_VERBOSE} = 0; +$ENV{MXNET_SUBGRAPH_VERBOSE} = 0; sub test_multi_trainer { @@ -252,4 +253,3 @@ sub test_trainer_reset_kv } test_trainer_reset_kv(); - diff --git a/perl-package/AI-MXNet/t/test_module.t b/perl-package/AI-MXNet/t/test_module.t index 3bbd8fdc4ea4..55e098683399 100644 --- a/perl-package/AI-MXNet/t/test_module.t +++ b/perl-package/AI-MXNet/t/test_module.t @@ -22,6 +22,7 @@ use AI::MXNet qw(mx); use AI::MXNet::Base; use AI::MXNet::TestUtils qw(almost_equal enumerate same_array dies_like rand_ndarray); $ENV{MXNET_STORAGE_FALLBACK_LOG_VERBOSE} = 0; +$ENV{MXNET_SUBGRAPH_VERBOSE} = 0; sub test_module_layout { diff --git a/perl-package/AI-MXNet/t/test_sparse_ndarray.t b/perl-package/AI-MXNet/t/test_sparse_ndarray.t index f143346b4890..afb0b25aa816 100644 --- a/perl-package/AI-MXNet/t/test_sparse_ndarray.t +++ b/perl-package/AI-MXNet/t/test_sparse_ndarray.t @@ -24,6 +24,7 @@ use AI::MXNet::TestUtils qw(zip assert enumerate same rand_shape_2d rand_shape_3 rand_sparse_ndarray random_arrays almost_equal rand_ndarray randint allclose dies_ok); use AI::MXNet::Base qw(pones pzeros pdl product rand_sparse); $ENV{MXNET_STORAGE_FALLBACK_LOG_VERBOSE} = 0; +$ENV{MXNET_SUBGRAPH_VERBOSE} = 0; sub sparse_nd_ones diff --git a/src/executor/graph_executor.cc b/src/executor/graph_executor.cc index d92253266f35..882105da1321 100644 --- a/src/executor/graph_executor.cc +++ b/src/executor/graph_executor.cc @@ -1627,16 +1627,16 @@ static nnvm::Graph InferForwardAttrs(nnvm::Graph g, static bool SubgraphBackendCheck(const op::SubgraphBackendPtr& backend, const Context& default_ctx, - bool verbose = false) { + int verbose = 1) { if (backend->HasAttr("enable") && (backend->GetAttr("enable") != true)) { - if (verbose) { + if (verbose > 1) { LOG(INFO) << "Subgraph backend " << backend->GetName() << " isn't activated."; } return false; } if (backend->HasAttr("context") && backend->GetAttr("context") != default_ctx) { - if (verbose) { + if (verbose > 1) { LOG(INFO) << "Subgraph backend " << backend->GetName() << " isn't activated as context mismatch."; } @@ -1647,7 +1647,7 @@ static bool SubgraphBackendCheck(const op::SubgraphBackendPtr& backend, static bool SubgraphPropertyCheck(const std::string& backend_name, const op::SubgraphPropertyPtr& prop, bool need_grad, - bool verbose = false) { + int verbose = 1) { auto full_name = prop->HasAttr("property_name") ? 
prop->GetAttr("property_name") : std::string(); if (prop->HasAttr("disable") && prop->GetAttr("disable") == true) { @@ -1657,7 +1657,7 @@ static bool SubgraphPropertyCheck(const std::string& backend_name, } if (prop->HasAttr("inference_only") && prop->GetAttr("inference_only") == true) { if (need_grad) { - if (verbose) { + if (verbose > 1) { LOG(INFO) << "skip partitioning graph with subgraph property " << full_name << " from backend " << backend_name << " as it requires `grad_req=null`."; } @@ -1699,7 +1699,7 @@ static nnvm::Symbol BuildSubgraph( const std::unordered_map& arg_stype_map, const Context& default_ctx, const std::map& ctx_map, std::vector* in_arg_ctxes, std::vector* arg_grad_ctxes, std::vector* grad_req_types, - std::vector* aux_state_ctxes, bool verbose = false) { + std::vector* aux_state_ctxes, int verbose = 1) { // setup map for in_arg_ctxes, arg_grad_ctxes, aux_state_ctxes and grad_req_types std::unordered_map in_arg_ctx_map; std::unordered_map arg_grad_ctx_map; @@ -1794,7 +1794,7 @@ static nnvm::Symbol BuildSubgraph(const nnvm::Symbol& src, const op::SubgraphBac std::vector* in_args, std::vector* arg_grad_store, std::vector* grad_req_type, - std::vector* aux_states, bool verbose = false) { + std::vector* aux_states, int verbose = 1) { // setup map for in_args, arg_grad_store, grad_req_type and aux_states std::unordered_map in_args_map; std::unordered_map arg_grad_store_map; @@ -1929,11 +1929,11 @@ Executor *Executor::SimpleBind(nnvm::Symbol symbol, auto exec = new exec::GraphExecutor(); bool init = false; if (!exec->subgraph_property().empty()) { - static bool verbose = dmlc::GetEnv("MXNET_SUBGRAPH_VERBOSE", false); + static int verbose = dmlc::GetEnv("MXNET_SUBGRAPH_VERBOSE", 1); const auto& backend_name = exec->subgraph_property(); const auto& backend = op::SubgraphBackendRegistry::Get()->GetSubgraphBackend(backend_name); if (exec::SubgraphBackendCheck(backend, default_ctx, verbose)) { - LOG(INFO) << "Subgraph backend " << backend_name << " is activated."; + if (verbose) LOG(INFO) << "Subgraph backend " << backend_name << " is activated."; std::vector tmp_in_arg_ctxes = in_arg_ctxes; std::vector tmp_arg_grad_ctxes = arg_grad_ctxes; std::vector tmp_aux_state_ctxes = aux_state_ctxes; @@ -2001,7 +2001,7 @@ Executor *Executor::Bind(nnvm::Symbol symbol, const std::vector &aux_states, Executor* shared_exec) { auto exec = new exec::GraphExecutor(); - static bool verbose = dmlc::GetEnv("MXNET_SUBGRAPH_VERBOSE", false); + static int verbose = dmlc::GetEnv("MXNET_SUBGRAPH_VERBOSE", 1); std::vector tmp_in_args = in_args; std::vector tmp_arg_grad_store = arg_grad_store; std::vector tmp_grad_req_type = grad_req_type; @@ -2011,7 +2011,7 @@ Executor *Executor::Bind(nnvm::Symbol symbol, const auto& backend_name = exec->subgraph_property(); const auto& backend = op::SubgraphBackendRegistry::Get()->GetSubgraphBackend(backend_name); if (exec::SubgraphBackendCheck(backend, default_ctx, verbose)) { - LOG(INFO) << "Subgraph backend " << backend_name << " is activated."; + if (verbose) LOG(INFO) << "Subgraph backend " << backend_name << " is activated."; symbol = exec::BuildSubgraph(symbol, backend, default_ctx, group2ctx, &tmp_in_args, &tmp_arg_grad_store, &tmp_grad_req_type, &tmp_aux_states, verbose); diff --git a/src/operator/subgraph/build_subgraph.cc b/src/operator/subgraph/build_subgraph.cc index d43647ac83b9..0f4c570331a2 100644 --- a/src/operator/subgraph/build_subgraph.cc +++ b/src/operator/subgraph/build_subgraph.cc @@ -318,8 +318,8 @@ void PreSelectSubgraphNodes(const nnvm::Graph& 
g, SubgraphSelectorV2Ptr subgraph for (auto node : excluded_nodes) { excluded_node_names += node->node->attrs.name + ", "; } - static bool verbose = dmlc::GetEnv("MXNET_SUBGRAPH_VERBOSE", false); - if (verbose) { + static int verbose = dmlc::GetEnv("MXNET_SUBGRAPH_VERBOSE", 1); + if (verbose > 1) { LOG(INFO) << "Found a cycle when BFS from node " << simple_nodes[snid]->node->attrs.name << ". Excluding nodes " << excluded_node_names << "and retrying"; } @@ -706,9 +706,9 @@ void TopSortEntries(const nnvm::Graph& g, } nnvm::Graph BuildSubgraph(nnvm::Graph&& g) { - static bool verbose = dmlc::GetEnv("MXNET_SUBGRAPH_VERBOSE", false); + static int verbose = dmlc::GetEnv("MXNET_SUBGRAPH_VERBOSE", 1); if (!g.HasAttr("subgraph_property")) { // treat the whole graph as a subgraph - if (verbose) { + if (verbose > 1) { LOG(INFO) << "The graph has no attribute of subgraph_property attached. " "The original graph is returned."; } From bde443e2399a8ce9ffe5957e5c3021b79c8055d2 Mon Sep 17 00:00:00 2001 From: Zhennan Qin Date: Thu, 24 Oct 2019 05:04:22 -0500 Subject: [PATCH 08/32] Fix dequantize memory corruption (#16606) Change-Id: I51b62a32987bdbcf96f04b1bc6617e66796f648b --- src/operator/quantization/dequantize.cc | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/operator/quantization/dequantize.cc b/src/operator/quantization/dequantize.cc index e8e2cd90b86c..9ce135040fb4 100644 --- a/src/operator/quantization/dequantize.cc +++ b/src/operator/quantization/dequantize.cc @@ -43,8 +43,6 @@ bool DequantizeStorageType(const nnvm::NodeAttrs& attrs, } #endif (*out_attrs)[0] = kDefaultStorage; - (*out_attrs)[1] = kDefaultStorage; - (*out_attrs)[2] = kDefaultStorage; return true; } From dd4eaf5c23046d07a4578a219e2dd3622e5620fa Mon Sep 17 00:00:00 2001 From: Zhennan Qin Date: Thu, 24 Oct 2019 05:05:06 -0500 Subject: [PATCH 09/32] [MKLDNN]Fix reorder2default (#16602) * Fix reorder2default Change-Id: I74c87af9535f6264e6d1ea7eaed089a6480a3358 * fix Change-Id: I6d07b43b520a47e7c78bd4b4b6390f5fb95e6957 * Fix Change-Id: Id72f25c34291be4711f55569c6d61467edd6113d * Fix CI Change-Id: I8c33a82555d5ace2d0b682c1e3eefa13f3a44768 * Run CI Change-Id: Ie8a6dab80ef91c0337cafbae4e3db277e0c7ebf7 --- src/ndarray/ndarray.cc | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc index 44da670b800d..aaa7aedf8bcd 100644 --- a/src/ndarray/ndarray.cc +++ b/src/ndarray/ndarray.cc @@ -1635,11 +1635,13 @@ void NDArray::Save(dmlc::Stream *strm) const { nd_cpu.WaitToRead(); save_data = nd_cpu.data(); } else { +#if MXNET_USE_MKLDNN == 1 + // For mkldnn, a copy of *this can ensure no write access pending on *this. 
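+      // The CPU copy should also come back in the default (non-MKLDNN) layout,
+      // which is why the explicit Reorder2Default() removed below is no longer
+      // needed before taking data().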
+ nd_cpu = this->Copy(Context::CPU()); + nd_cpu.WaitToRead(); +#else this->WaitToRead(); nd_cpu = *this; -#if MXNET_USE_MKLDNN == 1 - if (nd_cpu.IsMKLDNNData()) - nd_cpu = nd_cpu.Reorder2Default(); #endif save_data = nd_cpu.data(); } @@ -2024,15 +2026,18 @@ void NDArray::SyncCopyToCPU(void *data, size_t size) const { TBlob dst(data, dshape, cpu::kDevMask, this->dtype_, 0); // NOLINT(*) if (this->ctx().dev_mask() == cpu::kDevMask) { - this->WaitToRead(); - RunContext rctx{this->ctx(), nullptr, nullptr, false}; - NDArray src = *this; + Engine::Get()->PushAsync( + [&](RunContext rctx, Engine::CallbackOnComplete on_complete) { + RunContext ctx{this->ctx(), nullptr, nullptr, false}; + NDArray src = *this; #if MXNET_USE_MKLDNN == 1 - if (src.IsMKLDNNData()) - src = this->Reorder2Default(); + src = this->Reorder2Default(); #endif - ndarray::Copy(src.data(), &dst, - Context::CPU(), Context::CPU(), rctx); + ndarray::Copy(src.data(), &dst, Context::CPU(), Context::CPU(), ctx); + on_complete(); + }, + this->ctx(), {this->var()}, {}, FnProperty::kNormal, 0, "SyncCopyCPU2CPU"); + this->WaitToWrite(); } else { #if MXNET_USE_CUDA Engine::Get()->PushAsync( From e10e94e64cde6655aa3e7952981661fb49785f5b Mon Sep 17 00:00:00 2001 From: Talia <31782251+TEChopra1000@users.noreply.github.com> Date: Thu, 24 Oct 2019 09:09:24 -0700 Subject: [PATCH 10/32] second round of fixing broken links in multiple files (#16598) --- .../python/tutorials/packages/gluon/image/mnist.md | 2 +- .../python/tutorials/packages/ndarray/sparse/csr.md | 2 +- .../tutorials/packages/ndarray/sparse/row_sparse.md | 2 +- .../tutorials/packages/ndarray/sparse/train.md | 10 +++++----- .../_includes/get_started/devices/nvidia-jetson.md | 2 +- .../src/_includes/get_started/get_started.html | 8 ++++---- .../src/_includes/get_started/linux/java/cpu.md | 2 +- .../src/_includes/get_started/linux/java/gpu.md | 2 +- .../get_started/linux/julia/build-from-source.md | 2 +- .../src/_includes/get_started/linux/r/cpu.md | 2 +- .../src/_includes/get_started/linux/r/gpu.md | 2 +- .../src/_includes/get_started/macos/java/cpu.md | 2 +- .../src/_includes/get_started/pip_snippet.md | 2 +- .../get_started/windows/julia/build-from-source.md | 2 +- .../src/_includes/get_started/windows/perl/perl.md | 2 +- .../windows/python/cpu/build-from-source.md | 2 +- .../windows/python/gpu/build-from-source.md | 2 +- .../src/_includes/get_started/windows/r/cpu.md | 2 +- .../src/_includes/get_started/windows/r/gpu.md | 2 +- .../pages/api/r/docs/tutorials/callback_function.md | 8 ++++---- .../api/r/docs/tutorials/custom_loss_function.md | 8 ++++---- .../src/pages/api/r/docs/tutorials/multi_dim_lstm.md | 4 ++-- .../src/pages/api/r/docs/tutorials/ndarray.md | 12 ++++++------ .../src/pages/api/r/docs/tutorials/symbol.md | 12 ++++++------ docs/static_site/src/pages/get_started/index.html | 2 +- julia/docs/src/tutorial/mnist.md | 2 +- 26 files changed, 50 insertions(+), 50 deletions(-) diff --git a/docs/python_docs/python/tutorials/packages/gluon/image/mnist.md b/docs/python_docs/python/tutorials/packages/gluon/image/mnist.md index a6898278edf6..39726a3a511c 100644 --- a/docs/python_docs/python/tutorials/packages/gluon/image/mnist.md +++ b/docs/python_docs/python/tutorials/packages/gluon/image/mnist.md @@ -113,7 +113,7 @@ to train the MLP network we defined above. For our training, we will make use of the stochastic gradient descent (SGD) optimizer. In particular, we'll be using mini-batch SGD. Standard SGD processes train data one example at a time. 
In practice, this is very slow and one can speed up the process by processing examples in small batches. In this case, our batch size will be 100, which is a reasonable choice. Another parameter we select here is the learning rate, which controls the step size the optimizer takes in search of a solution. We'll pick a learning rate of 0.02, again a reasonable choice. Settings such as batch size and learning rate are what are usually referred to as hyper-parameters. What values we give them can have a great impact on training performance. We will use [Trainer](/api/python/docs/api/gluon/trainer.html) class to apply the -[SGD optimizer](https://mxnet.io/api/python/docs/api/gluon-related/_autogen/mxnet.optimizer.SGD.html) on the +[SGD optimizer](/api/python/docs/api/optimizer/index.html#mxnet.optimizer.SGD) on the initialized parameters. ```python diff --git a/docs/python_docs/python/tutorials/packages/ndarray/sparse/csr.md b/docs/python_docs/python/tutorials/packages/ndarray/sparse/csr.md index 0b362513c0ae..b91279cff4d4 100644 --- a/docs/python_docs/python/tutorials/packages/ndarray/sparse/csr.md +++ b/docs/python_docs/python/tutorials/packages/ndarray/sparse/csr.md @@ -556,7 +556,7 @@ except mx.MXNetError as err: ## Next -[Train a Linear Regression Model with Sparse Symbols](http://mxnet.apache.org/tutorials/sparse/train.html) +[Train a Linear Regression Model with Sparse Symbols](/api/python/docs/tutorials/packages/ndarray/sparse/train.html) diff --git a/docs/python_docs/python/tutorials/packages/ndarray/sparse/row_sparse.md b/docs/python_docs/python/tutorials/packages/ndarray/sparse/row_sparse.md index 1241182af85b..7500e82cf9e6 100644 --- a/docs/python_docs/python/tutorials/packages/ndarray/sparse/row_sparse.md +++ b/docs/python_docs/python/tutorials/packages/ndarray/sparse/row_sparse.md @@ -578,7 +578,7 @@ except mx.MXNetError as err: ## Next -[Train a Linear Regression Model with Sparse Symbols](http://mxnet.apache.org/tutorials/sparse/train.html) +[Train a Linear Regression Model with Sparse Symbols](/api/python/docs/tutorials/packages/ndarray/sparse/train.html) diff --git a/docs/python_docs/python/tutorials/packages/ndarray/sparse/train.md b/docs/python_docs/python/tutorials/packages/ndarray/sparse/train.md index 71669e142a4b..336185cf7583 100644 --- a/docs/python_docs/python/tutorials/packages/ndarray/sparse/train.md +++ b/docs/python_docs/python/tutorials/packages/ndarray/sparse/train.md @@ -27,18 +27,18 @@ then train a linear regression model using sparse symbols with the Module API. To complete this tutorial, we need: -- MXNet. See the instructions for your operating system in [Setup and Installation](https://mxnet.io/get_started). +- MXNet. See the instructions for your operating system in [Setup and Installation](/get_started). -- [Jupyter Notebook](https://jupyter.org/index.html) and [Python Requests](http://docs.python-requests.org/en/master/) packages. +- [Jupyter Notebook](https://jupyter.org/index.html) and [Python Requests](https://3.python-requests.org/) packages. ``` pip install jupyter requests ``` - Basic knowledge of Symbol in MXNet. See the detailed tutorial for Symbol in [Symbol - Neural Network Graphs and Auto-differentiation](https://mxnet.apache.org/tutorials/basic/symbol.html). -- Basic knowledge of CSRNDArray in MXNet. See the detailed tutorial for CSRNDArray in [CSRNDArray - NDArray in Compressed Sparse Row Storage Format](https://mxnet.apache.org/versions/master/tutorials/sparse/csr.html). +- Basic knowledge of CSRNDArray in MXNet. 
See the detailed tutorial for CSRNDArray in [CSRNDArray - NDArray in Compressed Sparse Row Storage Format](/api/python/docs/tutorials/packages/ndarray/sparse/csr.html). -- Basic knowledge of RowSparseNDArray in MXNet. See the detailed tutorial for RowSparseNDArray in [RowSparseNDArray - NDArray for Sparse Gradient Updates](https://mxnet.apache.org/versions/master/tutorials/sparse/row_sparse.html). +- Basic knowledge of RowSparseNDArray in MXNet. See the detailed tutorial for RowSparseNDArray in [RowSparseNDArray - NDArray for Sparse Gradient Updates](/api/python/docs/tutorials/packages/ndarray/sparse/row_sparse.html). ## Variables @@ -155,7 +155,7 @@ f = mx.sym.sparse.elemwise_add(c, c) ### Storage Type Inference What will be the output storage types of sparse symbols? In MXNet, for any sparse symbol, the result storage types are inferred based on storage types of inputs. -You can read the [Sparse Symbol API](https://mxnet.apache.org/versions/master/api/python/symbol/sparse.html) documentation to find what output storage types are. In the example below we will try out the storage types introduced in the Row Sparse and Compressed Sparse Row tutorials: `default` (dense), `csr`, and `row_sparse`. +You can read the [Sparse Symbol API](/api/python/docs/api/symbol/sparse/index.html) documentation to find what output storage types are. In the example below we will try out the storage types introduced in the Row Sparse and Compressed Sparse Row tutorials: `default` (dense), `csr`, and `row_sparse`. ```python diff --git a/docs/static_site/src/_includes/get_started/devices/nvidia-jetson.md b/docs/static_site/src/_includes/get_started/devices/nvidia-jetson.md index fe515f3392d7..40fb1d2e82f5 100644 --- a/docs/static_site/src/_includes/get_started/devices/nvidia-jetson.md +++ b/docs/static_site/src/_includes/get_started/devices/nvidia-jetson.md @@ -1,4 +1,4 @@ # NVIDIA Jetson Devices To install MXNet on a Jetson TX or Nano, please refer to the [Jetson installation -guide](get_started/jetson_setup). \ No newline at end of file +guide](/get_started/jetson_setup). \ No newline at end of file diff --git a/docs/static_site/src/_includes/get_started/get_started.html b/docs/static_site/src/_includes/get_started/get_started.html index 4905d28ce2d3..77367c7ed337 100644 --- a/docs/static_site/src/_includes/get_started/get_started.html +++ b/docs/static_site/src/_includes/get_started/get_started.html @@ -256,8 +256,8 @@

Installing MXNet


- For more installation options, refer to the Ubuntu installation guide and - CentOS installation guide. + For more installation options, refer to the Ubuntu installation guide and + CentOS installation guide. @@ -354,7 +354,7 @@

Installing MXNet


- For more installation options, refer to the MXNet macOS installation guide. + For more installation options, refer to the MXNet macOS installation guide. @@ -440,7 +440,7 @@

Installing MXNet

- For more installation options, refer to the MXNet Windows installation guide. + For more installation options, refer to the MXNet Windows installation guide. diff --git a/docs/static_site/src/_includes/get_started/linux/java/cpu.md b/docs/static_site/src/_includes/get_started/linux/java/cpu.md index 5345a2d754b2..fc6f598fa5ee 100644 --- a/docs/static_site/src/_includes/get_started/linux/java/cpu.md +++ b/docs/static_site/src/_includes/get_started/linux/java/cpu.md @@ -1,6 +1,6 @@ You can use the Maven packages defined in the following dependency to include MXNet in your Java project. The Java API is provided as a subset of the Scala API and is intended for inference only. -Please refer to the MXNet-Java setup guide for a detailed set of +Please refer to the MXNet-Java setup guide for a detailed set of instructions to help you with the setup process. diff --git a/docs/static_site/src/_includes/get_started/linux/java/gpu.md b/docs/static_site/src/_includes/get_started/linux/java/gpu.md index 5e687a353fe4..6f6757f6e2ea 100644 --- a/docs/static_site/src/_includes/get_started/linux/java/gpu.md +++ b/docs/static_site/src/_includes/get_started/linux/java/gpu.md @@ -1,6 +1,6 @@ You can use the Maven packages defined in the following dependency to include MXNet in your Java project. The Java API is provided as a subset of the Scala API and is intended for inference only. -Please refer to the MXNet-Java setup guide for a detailed set of +Please refer to the MXNet-Java setup guide for a detailed set of instructions to help you with the setup process. diff --git a/docs/static_site/src/_includes/get_started/linux/julia/build-from-source.md b/docs/static_site/src/_includes/get_started/linux/julia/build-from-source.md index fbbc0bd248a9..018aca9d7387 100644 --- a/docs/static_site/src/_includes/get_started/linux/julia/build-from-source.md +++ b/docs/static_site/src/_includes/get_started/linux/julia/build-from-source.md @@ -1,2 +1,2 @@ -Refer to the [Julia section of the MXNet Ubuntu installation guide](get_started/ubuntu_setup#install-the-mxnet-package-for-julia). +Refer to the [Julia section of the MXNet Ubuntu installation guide](/get_started/ubuntu_setup#install-the-mxnet-package-for-julia). diff --git a/docs/static_site/src/_includes/get_started/linux/r/cpu.md b/docs/static_site/src/_includes/get_started/linux/r/cpu.md index c0a4e015b61d..88ca5dd39933 100644 --- a/docs/static_site/src/_includes/get_started/linux/r/cpu.md +++ b/docs/static_site/src/_includes/get_started/linux/r/cpu.md @@ -1,5 +1,5 @@ The default version of R that is installed with `apt-get` is insufficient. You will need -to first [install R v3.4.4+ and build MXNet from source](get_started/ubuntu_setup.html#install-the-mxnet-package-for-r). +to first [install R v3.4.4+ and build MXNet from source](/get_started/ubuntu_setup.html#install-the-mxnet-package-for-r). After you have setup R v3.4.4+ and MXNet, you can build and install the MXNet R bindings with the following, assuming that `incubator-mxnet` is the source directory you used to build MXNet as follows: diff --git a/docs/static_site/src/_includes/get_started/linux/r/gpu.md b/docs/static_site/src/_includes/get_started/linux/r/gpu.md index 57afe7a8d65e..16fbfd09d4d4 100644 --- a/docs/static_site/src/_includes/get_started/linux/r/gpu.md +++ b/docs/static_site/src/_includes/get_started/linux/r/gpu.md @@ -1,7 +1,7 @@ The default version of R that is installed with `apt-get` is insufficient. 
You will need to first [install R v3.4.4+ and build MXNet from -source](get_started/ubuntu_setup.html#install-the-mxnet-package-for-r). +source](/get_started/ubuntu_setup.html#install-the-mxnet-package-for-r). After you have setup R v3.4.4+ and MXNet, you can build and install the MXNet R bindings with the diff --git a/docs/static_site/src/_includes/get_started/macos/java/cpu.md b/docs/static_site/src/_includes/get_started/macos/java/cpu.md index 2050149fd33d..002037a15771 100644 --- a/docs/static_site/src/_includes/get_started/macos/java/cpu.md +++ b/docs/static_site/src/_includes/get_started/macos/java/cpu.md @@ -1,7 +1,7 @@ You can use the Maven packages defined in the following dependency to include MXNet in your Java project. The Java API is provided as a subset of the Scala API and is intended for inference only. -Please refer to the [MXNet-Java setup guide](get_started/java_setup.html) for a detailed set of instructions to help you with the setup process. +Please refer to the [MXNet-Java setup guide](/get_started/java_setup.html) for a detailed set of instructions to help you with the setup process. PyPI for -other MXNet pip packages, or validate your MXNet installation. +other MXNet pip packages, or validate your MXNet installation.
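Where these snippets mention validating an installation, a short smoke test is enough; a minimal sketch (not part of the patched docs; any small op will do):

```python
import mxnet as mx

# A successful import plus one small op confirms that the native library
# loaded and basic compute works.
print(mx.__version__)
print((mx.nd.ones((2, 3)) * 2).asnumpy())
```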

Download from source

-

The signed source code for Apache MXNet (incubating) is available for download here

+

The signed source code for Apache MXNet (incubating) is available for download here

diff --git a/julia/docs/src/tutorial/mnist.md b/julia/docs/src/tutorial/mnist.md
index edc1a67d2485..a404f75efe12 100644
--- a/julia/docs/src/tutorial/mnist.md
+++ b/julia/docs/src/tutorial/mnist.md
@@ -23,7 +23,7 @@ multi-layer perceptron and then a convolutional neural network (the LeNet
 architecture) on the
 [MNIST handwritten digit dataset](http://yann.lecun.com/exdb/mnist/). The
 code for this tutorial could be found in
-[examples/mnist](https://github.com/apache/incubator-mxnet/blob/master/julia/docs/src/tutorial/mnist.md). There are also two Jupyter notebooks that expand a little more on the [MLP](https://github.com/ultradian/julia_notebooks/blob/master/mxnet/mnistMLP.ipynb) and the [LeNet](https://github.com/ultradian/julia_notebooks/blob/master/mxnet/mnistLenet.ipynb), using the more general `ArrayDataProvider`.
+[examples/mnist](/api/julia/docs/api/tutorial/mnist/). There are also two Jupyter notebooks that expand a little more on the [MLP](https://github.com/ultradian/julia_notebooks/blob/master/mxnet/mnistMLP.ipynb) and the [LeNet](https://github.com/ultradian/julia_notebooks/blob/master/mxnet/mnistLenet.ipynb), using the more general `ArrayDataProvider`.
 
 Simple 3-layer MLP
 ------------------

From 82ddc93e215e45419d3dc3db0ec30f24f40de851 Mon Sep 17 00:00:00 2001
From: Jacob Kim
Date: Fri, 25 Oct 2019 01:10:07 +0900
Subject: [PATCH 11/32] Python Docstring Convention (#16550)

* Docstring convention for
* Docstring convention for
* Docstring convention for
* Docstring convention for
* Docstring convention for
* Docstring convention for
* Docstring convention
* Revert removing new line
* Remove white space
---
 python/mxnet/kvstore.py    |  7 ++-----
 python/mxnet/metric.py     | 20 ++++++--------------
 python/mxnet/profiler.py   |  3 +--
 python/mxnet/rtc.py        |  3 ++-
 python/mxnet/runtime.py    | 22 ++++++----------------
 python/mxnet/test_utils.py | 12 +++++++-----
 python/mxnet/util.py       |  6 ++----
 7 files changed, 26 insertions(+), 47 deletions(-)

diff --git a/python/mxnet/kvstore.py b/python/mxnet/kvstore.py
index 5d332ff45ecb..61c64ec0984f 100644
--- a/python/mxnet/kvstore.py
+++ b/python/mxnet/kvstore.py
@@ -31,8 +31,7 @@ from .profiler import set_kvstore_handle
 
 def _ctype_key_value(keys, vals):
-    """
-    Returns ctype arrays for the key-value args, and whether string keys are used.
+    """Returns ctype arrays for the key-value args, and whether string keys are used.
     For internal use only.
""" if isinstance(keys, (tuple, list)): @@ -66,9 +65,7 @@ def _ctype_key_value(keys, vals): return (c_keys, c_handle_array(vals), use_str_keys) def _ctype_dict(param_dict): - """ - Returns ctype arrays for keys and values(converted to strings) in a dictionary - """ + """Returns ctype arrays for keys and values(converted to strings) in a dictionary""" assert(isinstance(param_dict, dict)), \ "unexpected type for param_dict: " + str(type(param_dict)) c_keys = c_array(ctypes.c_char_p, [c_str(k) for k in param_dict.keys()]) diff --git a/python/mxnet/metric.py b/python/mxnet/metric.py index 07ec2ef4d61d..6e2d66cb9d15 100644 --- a/python/mxnet/metric.py +++ b/python/mxnet/metric.py @@ -153,8 +153,7 @@ def reset(self): self.global_sum_metric = 0.0 def reset_local(self): - """Resets the local portion of the internal evaluation results - to initial state.""" + """Resets the local portion of the internal evaluation results to initial state.""" self.num_inst = 0 self.sum_metric = 0.0 @@ -372,8 +371,7 @@ def reset(self): pass def reset_local(self): - """Resets the local portion of the internal evaluation results - to initial state.""" + """Resets the local portion of the internal evaluation results to initial state.""" try: for metric in self.metrics: metric.reset_local() @@ -592,8 +590,7 @@ def update(self, labels, preds): class _BinaryClassificationMetrics(object): - """ - Private container class for classification metric statistics. True/false positive and + """Private container class for classification metric statistics. True/false positive and true/false negative counts are sufficient statistics for various classification metrics. This class provides the machinery to track those statistics across mini-batches of (label, prediction) pairs. @@ -610,9 +607,7 @@ def __init__(self): self.global_true_negatives = 0 def update_binary_stats(self, label, pred): - """ - Update various binary classification counts for a single (label, pred) - pair. + """Update various binary classification counts for a single (label, pred) pair. Parameters ---------- @@ -691,9 +686,7 @@ def global_fscore(self): return 0. def matthewscc(self, use_global=False): - """ - Calculate the Matthew's Correlation Coefficent - """ + """Calculate the Matthew's Correlation Coefficent""" if use_global: if not self.global_total_examples: return 0. @@ -1604,8 +1597,7 @@ def reset(self): self.reset_local() def reset_local(self): - """Resets the local portion of the internal evaluation results - to initial state.""" + """Resets the local portion of the internal evaluation results to initial state.""" self.num_inst = 0. self.lcm = numpy.zeros((self.k, self.k)) diff --git a/python/mxnet/profiler.py b/python/mxnet/profiler.py index 7dbc060ed60f..8e8ac87c9e06 100644 --- a/python/mxnet/profiler.py +++ b/python/mxnet/profiler.py @@ -207,8 +207,7 @@ def pause(profile_process='worker'): def resume(profile_process='worker'): - """ - Resume paused profiling. + """Resume paused profiling. Parameters ---------- diff --git a/python/mxnet/rtc.py b/python/mxnet/rtc.py index 4dea0e656b7e..5dfc5ea6dfe2 100644 --- a/python/mxnet/rtc.py +++ b/python/mxnet/rtc.py @@ -172,7 +172,8 @@ def get_kernel(self, name, signature): class CudaKernel(object): """Constructs CUDA kernel. Should be created by `CudaModule.get_kernel`, - not intended to be used by users.""" + not intended to be used by users. 
+ """ def __init__(self, handle, name, is_ndarray, dtypes): self.handle = handle self._name = name diff --git a/python/mxnet/runtime.py b/python/mxnet/runtime.py index 0f7de76937c0..f2e98fe674fa 100644 --- a/python/mxnet/runtime.py +++ b/python/mxnet/runtime.py @@ -26,9 +26,7 @@ from .base import _LIB, check_call class Feature(ctypes.Structure): - """ - Compile time feature description, member fields: `name` and `enabled`. - """ + """Compile time feature description, member fields: `name` and `enabled`.""" _fields_ = [ ("_name", ctypes.c_char_p), ("_enabled", ctypes.c_bool) @@ -36,16 +34,12 @@ class Feature(ctypes.Structure): @property def name(self): - """ - Feature name. - """ + """Feature name.""" return self._name.decode() @property def enabled(self): - """ - True if MXNet was compiled with the given compile-time feature. - """ + """True if MXNet was compiled with the given compile-time feature.""" return self._enabled def __repr__(self): @@ -55,8 +49,7 @@ def __repr__(self): return "✖ {}".format(self.name) def feature_list(): - """ - Check the library for compile-time features. The list of features are maintained in libinfo.h and libinfo.cc + """Check the library for compile-time features. The list of features are maintained in libinfo.h and libinfo.cc Returns ------- @@ -70,9 +63,7 @@ def feature_list(): return features class Features(collections.OrderedDict): - """ - OrderedDict of name to Feature - """ + """OrderedDict of name to Feature""" instance = None def __new__(cls): if cls.instance is None: @@ -84,8 +75,7 @@ def __repr__(self): return str(list(self.values())) def is_enabled(self, feature_name): - """ - Check for a particular feature by name + """Check for a particular feature by name Parameters ---------- diff --git a/python/mxnet/test_utils.py b/python/mxnet/test_utils.py index 4862aee8570d..6c8fefca4490 100644 --- a/python/mxnet/test_utils.py +++ b/python/mxnet/test_utils.py @@ -1935,8 +1935,7 @@ def same_array(array1, array2): @contextmanager def discard_stderr(): - """ - Discards error output of a routine if invoked as: + """Discards error output of a routine if invoked as: with discard_stderr(): ... @@ -2324,7 +2323,8 @@ def __exit__(self, ptype, value, trace): def collapse_sum_like(a, shape): """Given `a` as a numpy ndarray, perform reduce_sum on `a` over the axes that do not - exist in `shape`. Note that an ndarray with `shape` must be broadcastable to `a`.""" + exist in `shape`. Note that an ndarray with `shape` must be broadcastable to `a`. + """ assert len(a.shape) >= len(shape) if np.prod(shape) == 0 or a.size == 0: return np.zeros(shape, dtype=a.dtype) @@ -2349,7 +2349,8 @@ def is_cd_run(): def has_tvm_ops(): """Returns True if MXNet is compiled with TVM generated operators. If current ctx - is GPU, it only returns True for CUDA compute capability > 52 where FP16 is supported.""" + is GPU, it only returns True for CUDA compute capability > 52 where FP16 is supported. + """ built_with_tvm_op = _features.is_enabled("TVM_OP") ctx = current_context() if ctx.device_type == 'gpu': @@ -2367,7 +2368,8 @@ def has_tvm_ops(): def is_op_runnable(): """Returns True for all CPU tests. Returns True for GPU tests that are either of the following. 1. Built with USE_TVM_OP=0. - 2. Built with USE_TVM_OP=1, but with compute capability >= 53.""" + 2. Built with USE_TVM_OP=1, but with compute capability >= 53. 
+ """ ctx = current_context() if ctx.device_type == 'gpu': if not _features.is_enabled("TVM_OP"): diff --git a/python/mxnet/util.py b/python/mxnet/util.py index cef034fd0caa..9e15caae9698 100644 --- a/python/mxnet/util.py +++ b/python/mxnet/util.py @@ -60,8 +60,7 @@ def get_gpu_memory(gpu_dev_id): def set_np_shape(active): - """ - Turns on/off NumPy shape semantics, in which `()` represents the shape of scalar tensors, + """Turns on/off NumPy shape semantics, in which `()` represents the shape of scalar tensors, and tuples with `0` elements, for example, `(0,)`, `(1, 0, 2)`, represent the shapes of zero-size tensors. This is turned off by default for keeping backward compatibility. @@ -568,8 +567,7 @@ def hybrid_forward(self, F, x, w): def np_ufunc_legal_option(key, value): - """ - Checking if ufunc arguments are legal inputs + """Checking if ufunc arguments are legal inputs Parameters ---------- From 487d69a0efd5611b1fa887268f40aed6b5a2e3b2 Mon Sep 17 00:00:00 2001 From: Itsuki Toyota Date: Fri, 25 Oct 2019 01:10:31 +0900 Subject: [PATCH 12/32] [MXNET-1434] Fix a broken link for basic C++ tutorial (#16461) --- cpp-package/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp-package/README.md b/cpp-package/README.md index 05fb506db42b..77ff0ee36e80 100644 --- a/cpp-package/README.md +++ b/cpp-package/README.md @@ -55,7 +55,7 @@ In order to consume the C++ API please follow the steps below. ## Tutorial -A basic tutorial can be found at . +A basic tutorial can be found at . ## Examples From 9c99bf2d3e94ed9d1c9ea28e34d36a8e15bdfdb6 Mon Sep 17 00:00:00 2001 From: Przemyslaw Tredak Date: Thu, 24 Oct 2019 09:10:48 -0700 Subject: [PATCH 13/32] Fix for wrong reqs set after switching from training to inference (#16553) * Debugging reqs * Move literal strings to const static members * Fix lint --- src/imperative/cached_op.cc | 76 +++++++++++++++++++------------ src/imperative/cached_op.h | 7 +++ src/imperative/imperative_utils.h | 2 + 3 files changed, 57 insertions(+), 28 deletions(-) diff --git a/src/imperative/cached_op.cc b/src/imperative/cached_op.cc index 6818d757ab79..39c2880d627b 100644 --- a/src/imperative/cached_op.cc +++ b/src/imperative/cached_op.cc @@ -32,6 +32,22 @@ DMLC_REGISTER_PARAMETER(CachedOpConfig); constexpr uint32_t kEidNotExist = std::numeric_limits::max(); +const char CachedOp::FULL[] = "full"; +const char CachedOp::FORWARD[] = "forward"; +const char CachedOp::BACKWARD[] = "backward"; +const char CachedOp::REF_COUNT[] = "ref_count"; +const char CachedOp::MEM_PLAN[] = "mem_plan"; +const char CachedOp::STORAGE_PLAN[] = "storage_plan"; + +namespace { + +std::string AddPrefix(const std::string& prefix, + const std::string& s) { + return prefix + "_" + s; +} + +} // namespace + struct CachedOp::GraphInfo { nnvm::Graph fwd_graph; nnvm::Graph full_graph; @@ -136,7 +152,7 @@ CachedOp::CachedOp( for (const auto& j : idx[i].inputs) ++ref_count[idx.entry_id(j)]; } - fwd_graph_.attrs["forward_ref_count"] = + fwd_graph_.attrs[AddPrefix(FORWARD, REF_COUNT)] = std::make_shared(std::move(ref_count)); inlining_ = !config_.static_alloc && @@ -201,9 +217,9 @@ CachedOp::CachedOp( } } - auto full_ref_count = fwd_graph_.GetAttr >("forward_ref_count"); + auto full_ref_count = fwd_graph_.GetAttr >(AddPrefix(FORWARD, REF_COUNT)); for (size_t i = 0; i < num_forward_entries; ++i) full_ref_count.at(i) += ref_count[i]; - fwd_graph_.attrs["full_ref_count"] = + fwd_graph_.attrs[AddPrefix(FULL, REF_COUNT)] = std::make_shared(std::move(full_ref_count)); size_t num_forward_inputs = 
num_inputs(); @@ -336,14 +352,15 @@ bool CachedOp::SetForwardGraph( // When dynmaic shape exists, it is not feasible to plan memory ahead of time if (contain_dynamic_shape) { - g.attrs.erase("forward_mem_plan"); - g.attrs.erase("full_mem_plan"); + g.attrs.erase(AddPrefix(FORWARD, MEM_PLAN)); + g.attrs.erase(AddPrefix(FULL, MEM_PLAN)); return false; } + const std::string& prefix = recording ? FULL : FORWARD; if (!match) { - g.attrs.erase("forward_mem_plan"); - g.attrs.erase("full_mem_plan"); - } else if (g.attrs.count(recording ? "full_mem_plan" : "forward_mem_plan")) { + g.attrs.erase(AddPrefix(FORWARD, MEM_PLAN)); + g.attrs.erase(AddPrefix(FULL, MEM_PLAN)); + } else if (g.attrs.count(AddPrefix(prefix, MEM_PLAN))) { return true; } @@ -363,9 +380,9 @@ bool CachedOp::SetForwardGraph( } auto mem_plan = PlanMemory( - &g, std::move(storage), g.GetAttr >( - recording ? "full_ref_count" : "forward_ref_count")); - g.attrs[recording ? "full_mem_plan" : "forward_mem_plan"] = + &g, std::move(storage), g.GetAttr >(AddPrefix(prefix, REF_COUNT)), + AddPrefix(prefix, STORAGE_PLAN)); + g.attrs[AddPrefix(prefix, MEM_PLAN)] = std::make_shared(std::move(mem_plan)); return false; @@ -432,7 +449,7 @@ bool CachedOp::SetBackwardGraph( size_t num_forward_nodes = fwd_graph_.indexed_graph().num_nodes(); size_t num_forward_entries = fwd_graph_.indexed_graph().num_node_entries(); - if (!g.attrs.count("backward_ref_count")) { + if (!g.attrs.count(AddPrefix(BACKWARD, REF_COUNT))) { std::vector ref_count(idx.num_node_entries(), 0); for (size_t i = num_forward_nodes; i < idx.num_nodes(); ++i) { for (const auto& j : idx[i].inputs) ++ref_count[idx.entry_id(j)]; @@ -443,7 +460,7 @@ bool CachedOp::SetBackwardGraph( } } for (const auto& i : idx.outputs()) ++ref_count[idx.entry_id(i)]; - g.attrs["backward_ref_count"] = std::make_shared(std::move(ref_count)); + g.attrs[AddPrefix(BACKWARD, REF_COUNT)] = std::make_shared(std::move(ref_count)); } auto shapes = info->fwd_graph.GetAttr("shape"); @@ -476,8 +493,8 @@ bool CachedOp::SetBackwardGraph( false, node_range, entry_range); if (!match) { - g.attrs.erase("backward_mem_plan"); - } else if (g.attrs.count("backward_mem_plan")) { + g.attrs.erase(AddPrefix(BACKWARD, MEM_PLAN)); + } else if (g.attrs.count(AddPrefix(BACKWARD, MEM_PLAN))) { return true; } @@ -491,11 +508,13 @@ bool CachedOp::SetBackwardGraph( for (const auto i : idx.outputs()) storage[idx.entry_id(i)] = exec::kExternalStorageID; auto mem_plan = PlanMemory( - &g, std::move(storage), g.GetAttr >("backward_ref_count"), + &g, std::move(storage), + g.GetAttr >(AddPrefix(BACKWARD, REF_COUNT)), + AddPrefix(BACKWARD, STORAGE_PLAN), {num_forward_nodes, idx.num_nodes()}, {num_forward_entries, idx.num_node_entries()}, detect_inplace_addto); - g.attrs["backward_mem_plan"] = std::make_shared(std::move(mem_plan)); + g.attrs[AddPrefix(BACKWARD, MEM_PLAN)] = std::make_shared(std::move(mem_plan)); return false; } @@ -526,9 +545,10 @@ void CachedOp::StaticAllocMemory( const auto& default_ctx = state.context; nnvm::Graph& g = keep_fwd ? state.info.full_graph : state.info.fwd_graph; const auto& idx = g.indexed_graph(); - const auto& vstorage_inplace = g.GetAttr >("storage_inplace_index"); - const auto& mem_plan = g.GetAttr( - keep_fwd ? "backward_mem_plan" : (recording ? "full_mem_plan" : "forward_mem_plan")); + const std::string& graph_type = keep_fwd ? BACKWARD : (recording ? 
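The `AddPrefix` refactor above replaces the scattered literal strings with composed keys. A small illustrative sketch (plain Python, not MXNet API) of the graph-attribute names `CachedOp` now generates:

```python
# Hypothetical mirror of CachedOp::AddPrefix: prefix + "_" + attribute
def add_prefix(prefix, attr):
    return prefix + "_" + attr

graphs = ("full", "forward", "backward")           # CachedOp::FULL/FORWARD/BACKWARD
attrs = ("ref_count", "mem_plan", "storage_plan")  # REF_COUNT/MEM_PLAN/STORAGE_PLAN
print([add_prefix(g, a) for g in graphs for a in attrs])
# ['full_ref_count', 'full_mem_plan', ..., 'backward_storage_plan']
```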
FULL : FORWARD); + const auto& storage_plan_attr = AddPrefix(graph_type, STORAGE_PLAN); + const auto& storage_plan = g.GetAttr >(storage_plan_attr); + const auto& mem_plan = g.GetAttr(AddPrefix(graph_type, MEM_PLAN)); std::vector addto_entry; if (g.attrs.count("addto_entry")) { addto_entry = g.GetAttr >("addto_entry"); @@ -558,9 +578,9 @@ void CachedOp::StaticAllocMemory( for (size_t i = start_eid; i < end_eid; ++i) { if (addto_entry.size() && addto_entry[i]) { state.array_reqs[i] = kAddTo; - } else if (vstorage_inplace[i] >= 0) { + } else if (storage_plan[i] >= 0) { state.array_reqs[i] = kWriteInplace; - } else if (vstorage_inplace[i] == -2) { + } else if (storage_plan[i] == -2) { // -2 indicate that the entry is never referenced. state.array_reqs[i] = kNullOp; } else { @@ -862,8 +882,9 @@ OpStatePtr CachedOp::DynamicForward( } // Allocate NDArrays - std::vector ref_count = g.GetAttr >( - recording ? "full_ref_count" : "forward_ref_count"); + const std::string& graph_type = recording ? FULL : FORWARD; + std::vector ref_count = + g.GetAttr >(AddPrefix(graph_type, REF_COUNT)); std::vector array_reqs(arrays.size(), kWriteTo); for (size_t i = 0; i < idx.num_node_entries(); ++i) { @@ -871,8 +892,7 @@ OpStatePtr CachedOp::DynamicForward( } const auto& dispatch_modes = g.GetAttr("dispatch_mode"); if (!use_naive_run) { - const auto& mem_plan = g.GetAttr( - recording ? "full_mem_plan" : "forward_mem_plan"); + const auto& mem_plan = g.GetAttr(AddPrefix(graph_type, MEM_PLAN)); AllocateMemory(g, idx, default_ctx, 0, idx.num_node_entries(), mem_plan, arrays, &array_reqs); const auto& dtypes = g.GetAttr("dtype"); @@ -1011,7 +1031,7 @@ void CachedOp::DynamicBackward( } // Allocate NDArrays - auto ref_count = g.GetAttr >("backward_ref_count"); + auto ref_count = g.GetAttr >(AddPrefix(BACKWARD, REF_COUNT)); if (retain_graph) { for (size_t i = 0; i < num_forward_entries; ++i) ++ref_count[i]; } @@ -1027,7 +1047,7 @@ void CachedOp::DynamicBackward( if (ref_count[i] == 0) array_reqs[i] = kNullOp; } - const auto& mem_plan = g.GetAttr("backward_mem_plan"); + const auto& mem_plan = g.GetAttr(AddPrefix(BACKWARD, MEM_PLAN)); AllocateMemory(g, idx, default_ctx, num_forward_entries, idx.num_node_entries(), mem_plan, arrays, &array_reqs); diff --git a/src/imperative/cached_op.h b/src/imperative/cached_op.h index db049d59ed80..84f96300c27b 100644 --- a/src/imperative/cached_op.h +++ b/src/imperative/cached_op.h @@ -140,6 +140,13 @@ class CachedOp { void RegisterOpHook(const CachedOp::CachedOpMonCallback& callback, bool monitor_all = false); + static const char FULL[]; + static const char FORWARD[]; + static const char BACKWARD[]; + static const char REF_COUNT[]; + static const char MEM_PLAN[]; + static const char STORAGE_PLAN[]; + private: struct GraphInfo; struct DynamicRuntime; diff --git a/src/imperative/imperative_utils.h b/src/imperative/imperative_utils.h index c5932bb3bbfe..8317c6073a24 100644 --- a/src/imperative/imperative_utils.h +++ b/src/imperative/imperative_utils.h @@ -814,6 +814,7 @@ inline MemoryPlanVector PlanMemory( nnvm::Graph* p_g, nnvm::StorageVector&& storage, const std::vector& ref_count, + const std::string& storage_plan, const std::pair& node_range = {0, 0}, const std::pair& entry_range = {0, 0}, bool detect_inplace_addto = false) { @@ -831,6 +832,7 @@ inline MemoryPlanVector PlanMemory( const auto& dtypes = g.GetAttr("dtype"); const auto& shapes = g.GetAttr("shape"); const auto& storage_inplace = g.GetAttr >("storage_inplace_index"); + g.attrs[storage_plan] = 
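The bug this patch addresses is that write/inplace requests planned while recording the full (training) graph could leak into a later inference pass. A hedged Python-level sketch of the scenario, illustrative only and not the PR's regression test:

```python
import mxnet as mx
from mxnet import autograd, gluon

net = gluon.nn.Dense(4)
net.initialize()
net.hybridize(static_alloc=True)   # routes execution through CachedOp

x = mx.nd.ones((2, 8))
with autograd.record():            # plans with the "full" graph's reqs
    y = net(x)
y.backward()

out = net(x)                       # inference must re-plan with "forward" reqs
print(out)
```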
std::make_shared(storage_inplace); const auto& storage_ids = g.GetAttr("storage_id"); uint32_t entry_start = entry_range.first; uint32_t entry_end = From ef5633448517e8e67f3985fe8984efa8612c32dd Mon Sep 17 00:00:00 2001 From: Iblis Lin Date: Fri, 25 Oct 2019 00:12:15 +0800 Subject: [PATCH 14/32] julia/docs: more DRY on page rendering (#16396) --- julia/docs/Project.toml | 2 +- julia/docs/make.jl | 33 +++++++++++++++++++++++++++++ julia/docs/mkdocs.yml | 1 + julia/docs/src/api.md | 15 +------------ julia/docs/src/api/ndarray.md | 20 ++--------------- julia/docs/src/api/symbolic-node.md | 11 +--------- julia/docs/src/index.md | 16 ++------------ julia/src/executor.jl | 2 +- julia/src/symbolic-node/show.jl | 2 +- 9 files changed, 43 insertions(+), 59 deletions(-) diff --git a/julia/docs/Project.toml b/julia/docs/Project.toml index a4b243b0ffea..023a222beba6 100644 --- a/julia/docs/Project.toml +++ b/julia/docs/Project.toml @@ -4,4 +4,4 @@ DocumenterMarkdown = "997ab1e6-3595-5248-9280-8efb232c3433" MXNet = "a7949054-b901-59c6-b8e3-7238c29bf7f0" [compat] -Documenter = "~0.21" +Documenter = "~0.23" diff --git a/julia/docs/make.jl b/julia/docs/make.jl index 3e541c636888..3ea9b07d1056 100644 --- a/julia/docs/make.jl +++ b/julia/docs/make.jl @@ -19,6 +19,39 @@ using Documenter using DocumenterMarkdown using MXNet +""" +Return all files of a submodule + +julia> listpages("ndarray") +15-element Array{String,1}: + "ndarray.jl" + "ndarray/activation.jl" + "ndarray/arithmetic.jl" + "ndarray/array.jl" + ... + "ndarray/statistic.jl" + "ndarray/trig.jl" + "ndarray/type.jl" +""" +listpages(x) = + ["$x.jl"; joinpath.(x, readdir(joinpath(@__DIR__, "..", "src", x)))] + +const api_pages = [ + "api/context.md", + "api/ndarray.md", + "api/symbolic-node.md", + "api/model.md", + "api/initializers.md", + "api/optimizers.md", + "api/callbacks.md", + "api/metric.md", + "api/io.md", + "api/nn-factory.md", + "api/executor.md", + "api/kvstore.md", + "api/visualize.md", +] + makedocs( sitename = "MXNet.jl", modules = MXNet, diff --git a/julia/docs/mkdocs.yml b/julia/docs/mkdocs.yml index 22cb71869673..383505621540 100644 --- a/julia/docs/mkdocs.yml +++ b/julia/docs/mkdocs.yml @@ -62,4 +62,5 @@ nav: - Symbolic API: api/symbolic-node.md - Neural Networks Factory: api/nn-factory.md - Executor: api/executor.md + - Key-Value Store: api/kvstore.md - Network Visualization: api/visualize.md diff --git a/julia/docs/src/api.md b/julia/docs/src/api.md index 60cb0831d1bf..04cfadd6d698 100644 --- a/julia/docs/src/api.md +++ b/julia/docs/src/api.md @@ -18,18 +18,5 @@ # API Documentation ```@contents -Pages = [ - "api/symbolic-node.md", - "api/ndarray.md", - "api/context.md", - "api/model.md", - "api/initializers.md", - "api/optimizers.md", - "api/callbacks.md", - "api/metric.md", - "api/io.md", - "api/nn-factory.md", - "api/executor.md", - "api/visualize.md", -] +Pages = api_pages ``` diff --git a/julia/docs/src/api/ndarray.md b/julia/docs/src/api/ndarray.md index 64f59dc5393e..640e8b3ec372 100644 --- a/julia/docs/src/api/ndarray.md +++ b/julia/docs/src/api/ndarray.md @@ -19,7 +19,7 @@ ## Arithmetic Operations -In the following example `y` can be a `Real` value or another `NDArray` +In the following example `y` can be a `Real` value or another `NDArray`. 
| API | Example | | |-----|----------|----------------------------| @@ -70,21 +70,5 @@ In the following example `y` can be a `Real` value or another `NDArray` ```@autodocs Modules = [MXNet.mx] -Pages = [ - "ndarray.jl", - "ndarray/activation.jl", - "ndarray/arithmetic.jl", - "ndarray/array.jl", - "ndarray/autoimport.jl", - "ndarray/comparison.jl", - "ndarray/context.jl", - "ndarray/io.jl", - "ndarray/linalg.jl", - "ndarray/reduction.jl", - "ndarray/remap.jl", - "ndarray/show.jl", - "ndarray/statistic.jl", - "ndarray/trig.jl", - "ndarray/type.jl", -] +Pages = listpages("ndarray") ``` diff --git a/julia/docs/src/api/symbolic-node.md b/julia/docs/src/api/symbolic-node.md index 0efe4605c414..785dda87fbde 100644 --- a/julia/docs/src/api/symbolic-node.md +++ b/julia/docs/src/api/symbolic-node.md @@ -19,14 +19,5 @@ ```@autodocs Modules = [MXNet.mx] -Pages = [ - "symbolic-node.jl", - "symbolic-node/arithmetic.jl", - "symbolic-node/array.jl", - "symbolic-node/autodiff.jl", - "symbolic-node/io.jl", - "symbolic-node/op.jl", - "symbolic-node/show.jl", - "symbolic-node/type.jl", -] +Pages = listpages("symbolic-node") ``` diff --git a/julia/docs/src/index.md b/julia/docs/src/index.md index aacd844cc38e..4213265b4bd4 100644 --- a/julia/docs/src/index.md +++ b/julia/docs/src/index.md @@ -55,18 +55,6 @@ Depth = 2 ## API Documentation ```@contents -Pages = [ - "api/context.md", - "api/ndarray.md", - "api/symbolic-node.md", - "api/model.md", - "api/initializers.md", - "api/optimizers.md", - "api/callbacks.md", - "api/metric.md", - "api/io.md", - "api/nn-factory.md", - "api/executor.md", - "api/visualize.md", -] +Pages = api_pages +Depth = 2 ``` diff --git a/julia/src/executor.jl b/julia/src/executor.jl index 37f2dde615b8..7f6c2bb5aa58 100644 --- a/julia/src/executor.jl +++ b/julia/src/executor.jl @@ -245,7 +245,7 @@ Total 11 TempSpace resource requested ``` """ Base.print(io::IO, x::Executor) = print(io, debug_str(x)) -Base.print(x::Executor) = print(STDOUT, x) +Base.print(x::Executor) = print(stdout, x) function debug_str(x::Executor) s_ref = Ref{Cstring}(C_NULL) diff --git a/julia/src/symbolic-node/show.jl b/julia/src/symbolic-node/show.jl index f07c6b4655ee..9d40ea124505 100644 --- a/julia/src/symbolic-node/show.jl +++ b/julia/src/symbolic-node/show.jl @@ -57,6 +57,6 @@ function Base.print(io::IO, sym::SymbolicNode) print(io, unsafe_string(out[])) end -Base.print(sym::SymbolicNode) = print(STDOUT, sym) +Base.print(sym::SymbolicNode) = print(stdout, sym) From 4e03e6ac20dcf885b39792ebf4fd9e695e8f3eac Mon Sep 17 00:00:00 2001 From: Chaitanya Prakash Bapat Date: Thu, 24 Oct 2019 20:30:52 -0700 Subject: [PATCH 15/32] Disables test_bulking_operator_gpu due to flakiness (#16611) --- tests/python/gpu/test_operator_gpu.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/python/gpu/test_operator_gpu.py b/tests/python/gpu/test_operator_gpu.py index 06a16b1bb4f8..8b6928a2aa39 100644 --- a/tests/python/gpu/test_operator_gpu.py +++ b/tests/python/gpu/test_operator_gpu.py @@ -2395,6 +2395,7 @@ def _test_bulking_in_process(seed, time_per_iteration): @with_seed() +@unittest.skip('skippping temporarily, tracked by https://github.com/apache/incubator-mxnet/issues/16517') def test_bulking_operator_gpu(): _test_bulking(_test_bulking_in_process) From c0e616f8f41b1fe8aa5edb92766a80c3e2b54775 Mon Sep 17 00:00:00 2001 From: Chaitanya Prakash Bapat Date: Fri, 25 Oct 2019 09:54:41 -0700 Subject: [PATCH 16/32] C Api for simplebind, fix comment for trigoops, add atol to assert (#16585) * C Api for simplebind, fix comment for 
trigoops, add atol to assert * fix build issues * fix lint and add regression test * fix indent * api doc and function name change * fix lint and add infer shape test --- include/mxnet/c_api.h | 38 +++++ python/mxnet/symbol/symbol.py | 110 +++++++++----- src/c_api/c_api_executor.cc | 231 ++++++++++++++++++++++++----- tests/nightly/test_large_array.py | 6 +- tests/nightly/test_large_vector.py | 49 +++++- 5 files changed, 353 insertions(+), 81 deletions(-) diff --git a/include/mxnet/c_api.h b/include/mxnet/c_api.h index ac0c6726f2c7..2463a5b75cfd 100644 --- a/include/mxnet/c_api.h +++ b/include/mxnet/c_api.h @@ -2255,6 +2255,44 @@ MXNET_DLL int MXExecutorSimpleBindEx(SymbolHandle symbol_handle, NDArrayHandle** aux_states, ExecutorHandle shared_exec_handle, ExecutorHandle* out); + + +MXNET_DLL int MXExecutorSimpleBindEx64(SymbolHandle symbol_handle, + int dev_type, + int dev_id, + const uint32_t num_g2c_keys, + const char** g2c_keys, + const int* g2c_dev_types, + const int* g2c_dev_ids, + const uint32_t provided_grad_req_list_len, + const char** provided_grad_req_names, + const char** provided_grad_req_types, + const uint32_t num_provided_arg_shapes, + const char** provided_arg_shape_names, + const int64_t* provided_arg_shape_data, + const uint32_t* provided_arg_shape_idx, + const uint32_t num_provided_arg_dtypes, + const char** provided_arg_dtype_names, + const int* provided_arg_dtypes, + const uint32_t num_provided_arg_stypes, + const char** provided_arg_stype_names, + const int* provided_arg_stypes, + const uint32_t num_shared_arg_names, + const char** shared_arg_name_list, + int* shared_buffer_len, + const char** shared_buffer_name_list, + NDArrayHandle* shared_buffer_handle_list, + const char*** updated_shared_buffer_name_list, + NDArrayHandle** updated_shared_buffer_handle_list, + uint32_t* num_in_args, + NDArrayHandle** in_args, + NDArrayHandle** arg_grads, + uint32_t* num_aux_states, + NDArrayHandle** aux_states, + ExecutorHandle shared_exec_handle, + ExecutorHandle* out); + + /*! * \brief DEPRECATED. Use MXExecutorReshapeEx instead. 
* Return a new executor with the same symbol and shared memory, diff --git a/python/mxnet/symbol/symbol.py b/python/mxnet/symbol/symbol.py index b8e8db57188c..6146ab9dc50e 100644 --- a/python/mxnet/symbol/symbol.py +++ b/python/mxnet/symbol/symbol.py @@ -1695,42 +1695,80 @@ def simple_bind(self, ctx, grad_req='write', type_dict=None, stype_dict=None, aux_state_handles = ctypes.POINTER(NDArrayHandle)() try: - check_call(_LIB.MXExecutorSimpleBindEx(self.handle, - ctypes.c_int(ctx.device_typeid), - ctypes.c_int(ctx.device_id), - num_ctx_map_keys, - ctx_map_keys, - ctx_map_dev_types, - ctx_map_dev_ids, - mx_uint(provided_req_type_list_len), - provided_grad_req_names, - provided_grad_req_types, - mx_uint(len(provided_arg_shape_names)), - c_str_array(provided_arg_shape_names), - c_array_buf(mx_int, - array('I', provided_arg_shape_data)), - c_array_buf(mx_uint, - array('i', provided_arg_shape_idx)), - num_provided_arg_types, - provided_arg_type_names, - provided_arg_type_data, - num_provided_arg_stypes, - provided_arg_stype_names, - provided_arg_stype_data, - mx_uint(len(shared_arg_name_list)), - c_str_array(shared_arg_name_list), - ctypes.byref(shared_buffer_len), - shared_buffer_names, - shared_buffer_handles, - ctypes.byref(updated_shared_buffer_names), - ctypes.byref(updated_shared_buffer_handles), - ctypes.byref(num_in_args), - ctypes.byref(in_arg_handles), - ctypes.byref(arg_grad_handles), - ctypes.byref(num_aux_states), - ctypes.byref(aux_state_handles), - shared_exec_handle, - ctypes.byref(exe_handle))) + if sys.version_info[0] > 2 and _int64_enabled(): + check_call(_LIB.MXExecutorSimpleBindEx64(self.handle, + ctypes.c_int(ctx.device_typeid), + ctypes.c_int(ctx.device_id), + num_ctx_map_keys, + ctx_map_keys, + ctx_map_dev_types, + ctx_map_dev_ids, + mx_uint(provided_req_type_list_len), + provided_grad_req_names, + provided_grad_req_types, + mx_uint(len(provided_arg_shape_names)), + c_str_array(provided_arg_shape_names), + c_array_buf(mx_int64, + array('q', provided_arg_shape_data)), + c_array_buf(mx_uint, + array('i', provided_arg_shape_idx)), + num_provided_arg_types, + provided_arg_type_names, + provided_arg_type_data, + num_provided_arg_stypes, + provided_arg_stype_names, + provided_arg_stype_data, + mx_uint(len(shared_arg_name_list)), + c_str_array(shared_arg_name_list), + ctypes.byref(shared_buffer_len), + shared_buffer_names, + shared_buffer_handles, + ctypes.byref(updated_shared_buffer_names), + ctypes.byref(updated_shared_buffer_handles), + ctypes.byref(num_in_args), + ctypes.byref(in_arg_handles), + ctypes.byref(arg_grad_handles), + ctypes.byref(num_aux_states), + ctypes.byref(aux_state_handles), + shared_exec_handle, + ctypes.byref(exe_handle))) + else: + check_call(_LIB.MXExecutorSimpleBindEx(self.handle, + ctypes.c_int(ctx.device_typeid), + ctypes.c_int(ctx.device_id), + num_ctx_map_keys, + ctx_map_keys, + ctx_map_dev_types, + ctx_map_dev_ids, + mx_uint(provided_req_type_list_len), + provided_grad_req_names, + provided_grad_req_types, + mx_uint(len(provided_arg_shape_names)), + c_str_array(provided_arg_shape_names), + c_array_buf(mx_int, + array('I', provided_arg_shape_data)), + c_array_buf(mx_uint, + array('i', provided_arg_shape_idx)), + num_provided_arg_types, + provided_arg_type_names, + provided_arg_type_data, + num_provided_arg_stypes, + provided_arg_stype_names, + provided_arg_stype_data, + mx_uint(len(shared_arg_name_list)), + c_str_array(shared_arg_name_list), + ctypes.byref(shared_buffer_len), + shared_buffer_names, + shared_buffer_handles, + 
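From the Python side the dispatch above is transparent; `simple_bind` is called the same way whether the int64 (`...Ex64`) or int32 C API is selected. A small usage sketch:

```python
import mxnet as mx

data = mx.sym.Variable('data')
net = mx.sym.FullyConnected(data, num_hidden=10)
net = mx.sym.SoftmaxOutput(net, name='softmax')

# Allocates in_args/arg_grads/aux_states from the inferred shapes; with a
# large-tensor (int64) build on Python 3, MXExecutorSimpleBindEx64 is used.
exe = net.simple_bind(ctx=mx.cpu(), data=(32, 100))
print([arr.shape for arr in exe.arg_arrays])
```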
ctypes.byref(updated_shared_buffer_names), + ctypes.byref(updated_shared_buffer_handles), + ctypes.byref(num_in_args), + ctypes.byref(in_arg_handles), + ctypes.byref(arg_grad_handles), + ctypes.byref(num_aux_states), + ctypes.byref(aux_state_handles), + shared_exec_handle, + ctypes.byref(exe_handle))) except MXNetError as e: error_msg = "simple_bind error. Arguments:\n" for k, v in kwargs.items(): diff --git a/src/c_api/c_api_executor.cc b/src/c_api/c_api_executor.cc index ff85b4fd62fa..afc64f73de7c 100644 --- a/src/c_api/c_api_executor.cc +++ b/src/c_api/c_api_executor.cc @@ -515,44 +515,11 @@ int MXExecutorSimpleBind(SymbolHandle symbol_handle, API_END(); } -/*! - * \brief - * \param symbol_handle symbol handle - * \param dev_type default device type - * \param dev_id default device id - * \param num_g2c_keys number of group2ctx keys - * \param g2c_keys key list of group2ctx - * \param g2c_dev_types device type list of group2ctx - * \param g2c_dev_ids id list of group2ctx - * \param provided_grad_req_list_len grad_req length provided by users in front-end - * \param provided_grad_req_names grad_req names provided by users in front-end - * \param provided_grad_req_types req types provided by users in front-end - * \param num_provided_arg_shapes number of user provided in_arg and aux_state shapes - * \param provided_arg_shape_names name list of provided shapes - * \param provided_arg_shape_data provided shape data - * \param provided_arg_shape_idx provided shape data index - * \param num_provided_arg_dtypes number of user provided in_arg and axu_state dtypes - * \param provided_arg_dtype_names argument name list of provided dtypes - * \param provided_arg_dtypes data of provided dtypes - * \param num_provided_arg_stypes number of user provided in_arg and axu_state storage types - * \param provided_arg_stype_names argument name list of provided storage types - * \param provided_arg_stypes data of provided storage types - * \param num_shared_arg_names number of parameter names passed from _bind_ith_exec - * \param shared_arg_name_list parameter name list passed from _bind_ith_exec - * \param shared_buffer_len number of shared data arrays passed from _bind_ith_exec - * \param shared_buffer_name_list shared data array names passed from _bind_ith_exec - * \param shared_buffer_handle_list shared data array handles passed from _bind_ith_exec - * \param updated_shared_buffer_name_list updated shared data array names after binding - * \param updated_shared_buffer_handle_list updated shared data arrays after binding - * \param num_in_args number of input arguments of this sym - * \param in_args list_arguments associated with the current executor - * \param arg_grads list of gradients of in_args associated with the current executor - * \param num_aux_states number of aux states of this sym - * \param aux_states list_auxiliary_states associated with the current executor - * \param shared_exec_handle shared excutor handle passed from _bind_ith_exec - * \param out the handle of the executor to be created - */ -int MXExecutorSimpleBindEx(SymbolHandle symbol_handle, + +namespace mxnet { + +template +int _SimpleBindImpl(SymbolHandle symbol_handle, int dev_type, int dev_id, const uint32_t num_g2c_keys, @@ -564,7 +531,7 @@ int MXExecutorSimpleBindEx(SymbolHandle symbol_handle, const char** provided_grad_req_types, const uint32_t num_provided_arg_shapes, const char** provided_arg_shape_names, - const int* provided_arg_shape_data, + const DType* provided_arg_shape_data, const uint32_t* provided_arg_shape_idx, 
const uint32_t num_provided_arg_dtypes, const char** provided_arg_dtype_names, @@ -849,6 +816,192 @@ int MXExecutorSimpleBindEx(SymbolHandle symbol_handle, API_END(); } +} // namespace mxnet + + +/*! + * \brief Executor for simple_bind + * when INT64_TENSOR_SIZE = OFF + * \param symbol_handle symbol handle + * \param dev_type default device type + * \param dev_id default device id + * \param num_g2c_keys number of group2ctx keys + * \param g2c_keys key list of group2ctx + * \param g2c_dev_types device type list of group2ctx + * \param g2c_dev_ids id list of group2ctx + * \param provided_grad_req_list_len grad_req length provided by users in front-end + * \param provided_grad_req_names grad_req names provided by users in front-end + * \param provided_grad_req_types req types provided by users in front-end + * \param num_provided_arg_shapes number of user provided in_arg and aux_state shapes + * \param provided_arg_shape_names name list of provided shapes + * \param provided_arg_shape_data provided shape data + * \param provided_arg_shape_idx provided shape data index + * \param num_provided_arg_dtypes number of user provided in_arg and axu_state dtypes + * \param provided_arg_dtype_names argument name list of provided dtypes + * \param provided_arg_dtypes data of provided dtypes + * \param num_provided_arg_stypes number of user provided in_arg and axu_state storage types + * \param provided_arg_stype_names argument name list of provided storage types + * \param provided_arg_stypes data of provided storage types + * \param num_shared_arg_names number of parameter names passed from _bind_ith_exec + * \param shared_arg_name_list parameter name list passed from _bind_ith_exec + * \param shared_buffer_len number of shared data arrays passed from _bind_ith_exec + * \param shared_buffer_name_list shared data array names passed from _bind_ith_exec + * \param shared_buffer_handle_list shared data array handles passed from _bind_ith_exec + * \param updated_shared_buffer_name_list updated shared data array names after binding + * \param updated_shared_buffer_handle_list updated shared data arrays after binding + * \param num_in_args number of input arguments of this sym + * \param in_args list_arguments associated with the current executor + * \param arg_grads list of gradients of in_args associated with the current executor + * \param num_aux_states number of aux states of this sym + * \param aux_states list_auxiliary_states associated with the current executor + * \param shared_exec_handle shared excutor handle passed from _bind_ith_exec + * \param out the handle of the executor to be created + */ +int MXExecutorSimpleBindEx(SymbolHandle symbol_handle, + int dev_type, + int dev_id, + const uint32_t num_g2c_keys, + const char** g2c_keys, + const int* g2c_dev_types, + const int* g2c_dev_ids, + const uint32_t provided_grad_req_list_len, + const char** provided_grad_req_names, + const char** provided_grad_req_types, + const uint32_t num_provided_arg_shapes, + const char** provided_arg_shape_names, + const int* provided_arg_shape_data, + const uint32_t* provided_arg_shape_idx, + const uint32_t num_provided_arg_dtypes, + const char** provided_arg_dtype_names, + const int* provided_arg_dtypes, + const uint32_t num_provided_arg_stypes, + const char** provided_arg_stype_names, + const int* provided_arg_stypes, + const uint32_t num_shared_arg_names, + const char** shared_arg_name_list, + int* shared_buffer_len, + const char** shared_buffer_name_list, + NDArrayHandle* shared_buffer_handle_list, + const char*** 
updated_shared_buffer_name_list, + NDArrayHandle** updated_shared_buffer_handle_list, + uint32_t* num_in_args, + NDArrayHandle** in_args, + NDArrayHandle** arg_grads, + uint32_t* num_aux_states, + NDArrayHandle** aux_states, + ExecutorHandle shared_exec_handle, + ExecutorHandle* out) { + return mxnet::_SimpleBindImpl(symbol_handle, + dev_type, dev_id, + num_g2c_keys, g2c_keys, g2c_dev_types, g2c_dev_ids, + provided_grad_req_list_len, provided_grad_req_names, + provided_grad_req_types, + num_provided_arg_shapes, provided_arg_shape_names, + provided_arg_shape_data, provided_arg_shape_idx, + num_provided_arg_dtypes, provided_arg_dtype_names, provided_arg_dtypes, + num_provided_arg_stypes, provided_arg_stype_names, provided_arg_stypes, + num_shared_arg_names, shared_arg_name_list, + shared_buffer_len, shared_buffer_name_list, + shared_buffer_handle_list, updated_shared_buffer_name_list, + updated_shared_buffer_handle_list, + num_in_args, in_args, arg_grads, + num_aux_states, aux_states, + shared_exec_handle, out); +} + + +// TODO(ChaiBapchya): add API doc for rest of C APIs for int64 +/*! + * \brief Large tensor specific implementation for simple_bind executor + * when USE_INT64_TENSOR_SIZE = ON + * \param symbol_handle symbol handle + * \param dev_type default device type + * \param dev_id default device id + * \param num_g2c_keys number of group2ctx keys + * \param g2c_keys key list of group2ctx + * \param g2c_dev_types device type list of group2ctx + * \param g2c_dev_ids id list of group2ctx + * \param provided_grad_req_list_len grad_req length provided by users in front-end + * \param provided_grad_req_names grad_req names provided by users in front-end + * \param provided_grad_req_types req types provided by users in front-end + * \param num_provided_arg_shapes number of user provided in_arg and aux_state shapes + * \param provided_arg_shape_names name list of provided shapes + * \param provided_arg_shape_data provided shape data + * \param provided_arg_shape_idx provided shape data index + * \param num_provided_arg_dtypes number of user provided in_arg and axu_state dtypes + * \param provided_arg_dtype_names argument name list of provided dtypes + * \param provided_arg_dtypes data of provided dtypes + * \param num_provided_arg_stypes number of user provided in_arg and axu_state storage types + * \param provided_arg_stype_names argument name list of provided storage types + * \param provided_arg_stypes data of provided storage types + * \param num_shared_arg_names number of parameter names passed from _bind_ith_exec + * \param shared_arg_name_list parameter name list passed from _bind_ith_exec + * \param shared_buffer_len number of shared data arrays passed from _bind_ith_exec + * \param shared_buffer_name_list shared data array names passed from _bind_ith_exec + * \param shared_buffer_handle_list shared data array handles passed from _bind_ith_exec + * \param updated_shared_buffer_name_list updated shared data array names after binding + * \param updated_shared_buffer_handle_list updated shared data arrays after binding + * \param num_in_args number of input arguments of this sym + * \param in_args list_arguments associated with the current executor + * \param arg_grads list of gradients of in_args associated with the current executor + * \param num_aux_states number of aux states of this sym + * \param aux_states list_auxiliary_states associated with the current executor + * \param shared_exec_handle shared excutor handle passed from _bind_ith_exec + * \param out the handle of the 
executor to be created + */ +int MXExecutorSimpleBindEx64(SymbolHandle symbol_handle, + int dev_type, + int dev_id, + const uint32_t num_g2c_keys, + const char** g2c_keys, + const int* g2c_dev_types, + const int* g2c_dev_ids, + const uint32_t provided_grad_req_list_len, + const char** provided_grad_req_names, + const char** provided_grad_req_types, + const uint32_t num_provided_arg_shapes, + const char** provided_arg_shape_names, + const int64_t* provided_arg_shape_data, + const uint32_t* provided_arg_shape_idx, + const uint32_t num_provided_arg_dtypes, + const char** provided_arg_dtype_names, + const int* provided_arg_dtypes, + const uint32_t num_provided_arg_stypes, + const char** provided_arg_stype_names, + const int* provided_arg_stypes, + const uint32_t num_shared_arg_names, + const char** shared_arg_name_list, + int* shared_buffer_len, + const char** shared_buffer_name_list, + NDArrayHandle* shared_buffer_handle_list, + const char*** updated_shared_buffer_name_list, + NDArrayHandle** updated_shared_buffer_handle_list, + uint32_t* num_in_args, + NDArrayHandle** in_args, + NDArrayHandle** arg_grads, + uint32_t* num_aux_states, + NDArrayHandle** aux_states, + ExecutorHandle shared_exec_handle, + ExecutorHandle* out) { + return mxnet::_SimpleBindImpl(symbol_handle, + dev_type, dev_id, + num_g2c_keys, g2c_keys, g2c_dev_types, g2c_dev_ids, + provided_grad_req_list_len, provided_grad_req_names, + provided_grad_req_types, + num_provided_arg_shapes, provided_arg_shape_names, + provided_arg_shape_data, provided_arg_shape_idx, + num_provided_arg_dtypes, provided_arg_dtype_names, provided_arg_dtypes, + num_provided_arg_stypes, provided_arg_stype_names, provided_arg_stypes, + num_shared_arg_names, shared_arg_name_list, + shared_buffer_len, shared_buffer_name_list, + shared_buffer_handle_list, updated_shared_buffer_name_list, + updated_shared_buffer_handle_list, + num_in_args, in_args, arg_grads, + num_aux_states, aux_states, + shared_exec_handle, out); +} + + int MXExecutorReshape(int partial_shaping, int allow_up_sizing, int dev_type, diff --git a/tests/nightly/test_large_array.py b/tests/nightly/test_large_array.py index c18a95400f22..74ac179a7e60 100644 --- a/tests/nightly/test_large_array.py +++ b/tests/nightly/test_large_array.py @@ -1351,17 +1351,17 @@ def check_trunc(): def create_input_for_trigonometric_ops(vals): - # Creates large vector input of size(LARGE_X*10, SMALL_Y/10) from vals using tile operator + # Creates large vector input of size(LARGE_X*10, SMALL_Y/10) from vals using broadcast_to operator inp = nd.array(vals).reshape(1, 5) inp = nd.broadcast_to(inp, (LARGE_X*10, SMALL_Y//10)) return inp -def assert_correctness_of_trigonometric_ops(output, expected_vals): +def assert_correctness_of_trigonometric_ops(output, expected_vals, atol=1e-3): # checks verifies 5 values at positions(0, 1, -3, -2, -1) of the input vector output_idx_to_inspect = [0, 1, -3, -2, -1] for i in range(len(output_idx_to_inspect)): - assert np.abs(output[1][output_idx_to_inspect[i]].asnumpy()-expected_vals[i]) <= 1e-3 + assert np.abs(output[1][output_idx_to_inspect[i]].asnumpy()-expected_vals[i]) <= atol def test_trigonometric_ops(): diff --git a/tests/nightly/test_large_vector.py b/tests/nightly/test_large_vector.py index b8edc83220bd..c6a99a5d0826 100644 --- a/tests/nightly/test_large_vector.py +++ b/tests/nightly/test_large_vector.py @@ -64,7 +64,7 @@ def test_ndarray_random_randint(): high = 2**34 a = nd.random.randint(low, high, dtype=np.int64, shape=LARGE_X).asnumpy() assert a.shape == (LARGE_X,) - 
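The `broadcast_to`-based input construction above can be tried at small scale; a sketch with toy sizes standing in for the nightly constants `LARGE_X` and `SMALL_Y`:

```python
import mxnet.ndarray as nd

LARGE_X, SMALL_Y = 100, 50         # stand-ins for the nightly test constants
vals = [-1, -0.707, 0, 0.707, 1]
inp = nd.broadcast_to(nd.array(vals).reshape(1, 5),
                      (LARGE_X * 10, SMALL_Y // 10))
print(inp.shape)                   # (1000, 5): each row repeats `vals`
```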
assert (a >= low).all() and (a < high).all() + assert (a >= low).all() and (a < high).all() def test_ndarray_empty(): @@ -710,6 +710,39 @@ def test_full(): assert a[-1] == 3 +def test_regression(): + shape = (LARGE_X, ) + + def check_regression(symbol, forward, shape): + # init executor + data_s = mx.symbol.Variable('data') + label_s = mx.symbol.Variable('label') + out_s = symbol(data=data_s, label=label_s) + exe = out_s.simple_bind(ctx=mx.cpu(0), data=shape, label=shape) + + arg_map = dict(zip(out_s.list_arguments(), exe.arg_arrays)) + + # init data + data = mx.random.uniform(-1, -1, shape) + arg_map["data"][:] = data + atol = 1e-5 + density = 0.5 + stype = 'default' + label = arg_map["label"] + label[:] = rand_ndarray(shape, stype, density=density) + exe.forward(is_train=True) + exe.backward() + np_out = forward(data.asnumpy()) + assert_almost_equal(exe.outputs[0].asnumpy(), np_out, atol=atol) + + check_regression(mx.symbol.LogisticRegressionOutput, + lambda x: 1.0 / (1.0 + np.exp(-x)), + shape) + check_regression(mx.symbol.LinearRegressionOutput, + lambda x: x, + shape) + + def test_sign(): a = mx.nd.random.normal(-1, 1, shape=LARGE_X) mx_res = mx.nd.sign(a) @@ -978,11 +1011,11 @@ def test_add_n(): def test_modulo(): x = mx.nd.ones(LARGE_X)*6 y = mx.nd.ones(LARGE_X)*4 - z = (x%y) + z = (x % y) assert z[0] == 2 assert z[-1] == 2 x = mx.nd.ones(LARGE_X)*5 - z = nd.modulo(x,y) + z = nd.modulo(x, y) assert z[0] == 1 assert z[-1] == 1 @@ -1022,6 +1055,16 @@ def test_gather(): assert np.sum(arr[idx] == 2) == 10 +def test_infer_shape(): + data_1 = mx.symbol.Variable('data_1') + data_2 = mx.symbol.Variable('data_2') + add = data_1+data_2 + # > add.infer_shape(data_1=(LARGE_X,), data_2=(LARGE_X,)) + # OUTPUT - arg_shapes, out_shapes, aux_shapes + _, out_shapes, _ = add.infer_shape(data_1=(LARGE_X,), data_2=(LARGE_X,)) + assert out_shapes == [(LARGE_X,)] + + if __name__ == '__main__': import nose nose.runmodule() From c574067d75d612b5e2a95eaed7618cc9c1e3b540 Mon Sep 17 00:00:00 2001 From: Chaitanya Prakash Bapat Date: Fri, 25 Oct 2019 12:36:52 -0700 Subject: [PATCH 17/32] Imagenet inference to nightly fix (#16599) * split to cd and shell * comment * lots of prints * copy binary at correct location * remove comments * add mkl lib * update docker run build function * set nvidia docker true to run imagenet inference on GPU * Revert "set nvidia docker true to run imagenet inference on GPU" This reverts commit 98f8eef2057351d7964f1e9326ea6772c216f0af. As we don't need GPU for compilation. --- ci/docker/runtime_functions.sh | 5 +++-- tests/nightly/JenkinsfileForBinaries | 10 +++++----- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh index aab49f28a427..b53db3f980f1 100755 --- a/ci/docker/runtime_functions.sh +++ b/ci/docker/runtime_functions.sh @@ -1481,8 +1481,9 @@ nightly_test_installation() { nightly_test_imagenet_inference() { set -ex echo $PWD - cp /work/mxnet/build/cpp-package/example/imagenet_inference . 
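The new `test_infer_shape` exercises symbolic shape inference on large vectors; the same call works at any scale:

```python
import mxnet as mx

data_1 = mx.symbol.Variable('data_1')
data_2 = mx.symbol.Variable('data_2')
add = data_1 + data_2

# infer_shape propagates input shapes through the graph and returns
# (arg_shapes, out_shapes, aux_shapes)
_, out_shapes, _ = add.infer_shape(data_1=(100,), data_2=(100,))
print(out_shapes)                  # [(100,)]
```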
- /work/mxnet/cpp-package/example/inference/unit_test_imagenet_inference.sh + cp /work/mxnet/build/cpp-package/example/imagenet_inference /work/mxnet/cpp-package/example/inference/ + cd /work/mxnet/cpp-package/example/inference/ + ./unit_test_imagenet_inference.sh } #Runs a simple MNIST training example diff --git a/tests/nightly/JenkinsfileForBinaries b/tests/nightly/JenkinsfileForBinaries index a66159d0075b..af87b2c35658 100755 --- a/tests/nightly/JenkinsfileForBinaries +++ b/tests/nightly/JenkinsfileForBinaries @@ -20,7 +20,7 @@ mx_lib = 'lib/libmxnet.so, lib/libmxnet.a, lib/libtvm_runtime.so, lib/libtvmop.so, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a' mx_cmake_lib = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/tvm/libtvm_runtime.so, build/libtvmop.so, build/3rdparty/dmlc-core/libdmlc.a, build/tests/mxnet_unit_tests, build/3rdparty/openmp/runtime/src/libomp.so' -mx_lib_cpp_example = 'lib/libmxnet.so, lib/libmxnet.a, lib/libtvm_runtime.so, lib/libtvmop.so, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a, build/cpp-package/example/imagenet_inference' +mx_lib_cpp_example_mkl = 'lib/libmxnet.so, lib/libmxnet.a, lib/libtvm_runtime.so, lib/libtvmop.so, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a, build/cpp-package/example/imagenet_inference, lib/libmkldnn.so.0, lib/libmklml_intel.so' node('utility') { // Loading the utilities requires a node context unfortunately @@ -34,10 +34,10 @@ core_logic: { stage('Build') { parallel 'GPU: CUDA10.1+cuDNN7': { node(NODE_LINUX_CPU) { - ws('workspace/build-gpu') { + ws('workspace/build-mkldnn-gpu') { utils.init_git() - utils.docker_run('ubuntu_build_cuda', 'build_ubuntu_gpu_cuda101_cudnn7', false) - utils.pack_lib('gpu', mx_lib_cpp_example) + utils.docker_run('ubuntu_build_cuda', 'build_ubuntu_gpu_mkldnn', false) + utils.pack_lib('gpu', mx_lib_cpp_example_mkl) } } }/*, @@ -73,7 +73,7 @@ core_logic: { 'ImageNet Inference: GPU': { node(NODE_LINUX_GPU) { ws('workspace/nt-ImageInferenceTest') { - utils.unpack_and_init('gpu', mx_lib_cpp_example) + utils.unpack_and_init('gpu', mx_lib_cpp_example_mkl) utils.docker_run('ubuntu_nightly_gpu', 'nightly_test_imagenet_inference', true) } } From 78627387d41f284067bec164063ae53dbc32adf3 Mon Sep 17 00:00:00 2001 From: Chaitanya Prakash Bapat Date: Fri, 25 Oct 2019 15:55:50 -0700 Subject: [PATCH 18/32] Fix python doc build issue (#16630) * pin the pip versions * remove nbconvert comment --- docs/python_docs/environment.yml | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/docs/python_docs/environment.yml b/docs/python_docs/environment.yml index 11e43a1733f3..5f66d7715af9 100644 --- a/docs/python_docs/environment.yml +++ b/docs/python_docs/environment.yml @@ -27,13 +27,12 @@ dependencies: - matplotlib - notebook - pip: - # using nbconvert master until v5.5 comes out - - git+https://github.com/jupyter/nbconvert@master - - nbsphinx>=0.4.2 - - recommonmark - - notedown - - pypandoc - - breathe - - mock - - awscli - - autodocsumm + - nbconvert==5.6.1 + - nbsphinx==0.4.3 + - recommonmark==0.6.0 + - notedown==1.5.1 + - pypandoc==1.4 + - breathe==4.13.1 + - mock==3.0.5 + - awscli==1.16.266 + - autodocsumm==0.1.11 From 0712f00a2a1fd0d33db97678a176064017a0e75d Mon Sep 17 00:00:00 2001 From: Brenton Chu Date: Fri, 25 Oct 2019 20:27:10 -0700 Subject: [PATCH 19/32] Faster general take (#16615) * Sped up perf of take op when axis != 0 * Formatting and syntax fixes * Rename Take to specify axis * Fix line length lint errors --- 
src/operator/tensor/indexing_op.cc | 59 ++++++++++++++++------------- src/operator/tensor/indexing_op.cu | 61 ++++++++++++++++-------------- src/operator/tensor/indexing_op.h | 28 +++++++------- 3 files changed, 79 insertions(+), 69 deletions(-) diff --git a/src/operator/tensor/indexing_op.cc b/src/operator/tensor/indexing_op.cc index 9961218b5482..470abee71a59 100644 --- a/src/operator/tensor/indexing_op.cc +++ b/src/operator/tensor/indexing_op.cc @@ -29,7 +29,7 @@ namespace mxnet { namespace op { template -struct TakeCPU { +struct TakeZeroAxisCPU { // assume that idx have been flattened to a 1-D tensor (N,) // assume that out_data and in_data have been flattened to 2-D tensors, (N, M) and (K, M) // M is the number of columns of in_data and out_data @@ -88,8 +88,9 @@ void EmbeddingOpForwardDnsImpl(mshadow::Stream* s, Tensor wmat = weight.get(s); Tensor out = output.get_with_shape( Shape2(oshape.ProdShape(0, oshape.ndim()-1), oshape[oshape.ndim()-1]), s); - Kernel, cpu>::Launch(s, oshape.Size() / wmat.shape_[1], out.dptr_, wmat.dptr_, - idx.dptr_, wmat.shape_[1], wmat.shape_[0]); + Kernel, cpu>::Launch(s, oshape.Size() / wmat.shape_[1], out.dptr_, + wmat.dptr_, idx.dptr_, + wmat.shape_[1], wmat.shape_[0]); }); }); } @@ -308,17 +309,17 @@ void TakeOpForward(const nnvm::NodeAttrs& attrs, } if (actual_axis == 0) { if (param.mode == take_::kClip) { - Kernel, cpu>::Launch(s, idxshape.Size(), - outputs[take_::kOut].dptr(), - inputs[take_::kArr].dptr(), - inputs[take_::kIdx].dptr(), - oshape.Size()/idxshape.Size(), arrshape[0]); + Kernel, cpu>::Launch(s, idxshape.Size(), + outputs[take_::kOut].dptr(), + inputs[take_::kArr].dptr(), + inputs[take_::kIdx].dptr(), + oshape.Size()/idxshape.Size(), arrshape[0]); } else { - Kernel, cpu>::Launch(s, idxshape.Size(), - outputs[take_::kOut].dptr(), - inputs[take_::kArr].dptr(), - inputs[take_::kIdx].dptr(), - oshape.Size()/idxshape.Size(), arrshape[0]); + Kernel, cpu>::Launch(s, idxshape.Size(), + outputs[take_::kOut].dptr(), + inputs[take_::kArr].dptr(), + inputs[take_::kIdx].dptr(), + oshape.Size()/idxshape.Size(), arrshape[0]); } } else { mshadow::Shape<10> in_strides; @@ -332,21 +333,25 @@ void TakeOpForward(const nnvm::NodeAttrs& attrs, out_strides[i] = stride; } if (param.mode == take_::kClip) { - Kernel, cpu>::Launch(s, oshape.Size(), - outputs[take_::kOut].dptr(), - inputs[take_::kArr].dptr(), - inputs[take_::kIdx].dptr(), - in_strides, out_strides, arrshape.ndim(), - oshape.ndim(), idxshape.ndim(), - arrshape[actual_axis], actual_axis); + Kernel, cpu>::Launch(s, oshape.Size(), + outputs[take_::kOut].dptr(), + inputs[take_::kArr].dptr(), + inputs[take_::kIdx].dptr(), + out_strides[actual_axis-1], + in_strides[actual_axis-1], + in_strides[actual_axis], arrshape.ndim(), + oshape.ndim(), idxshape.ndim(), + arrshape[actual_axis], actual_axis); } else { - Kernel, cpu>::Launch(s, oshape.Size(), - outputs[take_::kOut].dptr(), - inputs[take_::kArr].dptr(), - inputs[take_::kIdx].dptr(), - in_strides, out_strides, arrshape.ndim(), - oshape.ndim(), idxshape.ndim(), - arrshape[actual_axis], actual_axis); + Kernel, cpu>::Launch(s, oshape.Size(), + outputs[take_::kOut].dptr(), + inputs[take_::kArr].dptr(), + inputs[take_::kIdx].dptr(), + out_strides[actual_axis-1], + in_strides[actual_axis-1], + in_strides[actual_axis], arrshape.ndim(), + oshape.ndim(), idxshape.ndim(), + arrshape[actual_axis], actual_axis); } } }); diff --git a/src/operator/tensor/indexing_op.cu b/src/operator/tensor/indexing_op.cu index 0b4c20bf2bb5..3ccf1f39d4f7 100644 --- 
a/src/operator/tensor/indexing_op.cu +++ b/src/operator/tensor/indexing_op.cu @@ -116,11 +116,8 @@ struct AddTakeGradRspDeterministicKernel { } }; -/*! \brief name the struct Take instead of take - * to avoid conflict with the take function in mshadow - */ template -struct TakeGPU { +struct TakeZeroAxisGPU { // assume that idx have been flattened to a 1-D tensor (N,) // assume that out_data and in_data have been flattened to 2-D tensors, (N, M) and (K, M) // M is the number of columns of in_data and out_data @@ -180,8 +177,8 @@ void EmbeddingOpForwardDnsImpl(mshadow::Stream* s, Tensor wmat = weight.get(s); Tensor out = output.get_with_shape( Shape2(oshape.ProdShape(0, oshape.ndim()-1), oshape[oshape.ndim()-1]), s); - Kernel, gpu>::Launch(s, oshape.Size(), out.dptr_, wmat.dptr_, - idx.dptr_, wmat.shape_[1], wmat.shape_[0]); + Kernel, gpu>::Launch(s, oshape.Size(), out.dptr_, wmat.dptr_, + idx.dptr_, wmat.shape_[1], wmat.shape_[0]); }); }); } @@ -502,17 +499,17 @@ void TakeOpForward(const nnvm::NodeAttrs& attrs, } if (actual_axis == 0) { if (param.mode == take_::kClip) { - Kernel, gpu>::Launch(s, oshape.Size(), - outputs[take_::kOut].dptr(), - inputs[take_::kArr].dptr(), - inputs[take_::kIdx].dptr(), - oshape.Size()/idxshape.Size(), arrshape[0]); + Kernel, gpu>::Launch(s, oshape.Size(), + outputs[take_::kOut].dptr(), + inputs[take_::kArr].dptr(), + inputs[take_::kIdx].dptr(), + oshape.Size()/idxshape.Size(), arrshape[0]); } else { - Kernel, gpu>::Launch(s, oshape.Size(), - outputs[take_::kOut].dptr(), - inputs[take_::kArr].dptr(), - inputs[take_::kIdx].dptr(), - oshape.Size()/idxshape.Size(), arrshape[0]); + Kernel, gpu>::Launch(s, oshape.Size(), + outputs[take_::kOut].dptr(), + inputs[take_::kArr].dptr(), + inputs[take_::kIdx].dptr(), + oshape.Size()/idxshape.Size(), arrshape[0]); } } else { mshadow::Shape<10> in_strides; @@ -526,19 +523,27 @@ void TakeOpForward(const nnvm::NodeAttrs& attrs, out_strides[i] = stride; } if (param.mode == take_::kClip) { - Kernel, gpu>::Launch(s, oshape.Size(), - outputs[take_::kOut].dptr(), - inputs[take_::kArr].dptr(), - inputs[take_::kIdx].dptr(), - in_strides, out_strides, arrshape.ndim(), oshape.ndim(), - idxshape.ndim(), arrshape[actual_axis], actual_axis); + Kernel, gpu>::Launch(s, oshape.Size(), + outputs[take_::kOut].dptr(), + inputs[take_::kArr].dptr(), + inputs[take_::kIdx].dptr(), + out_strides[actual_axis-1], + in_strides[actual_axis-1], + in_strides[actual_axis], + arrshape.ndim(), oshape.ndim(), + idxshape.ndim(), arrshape[actual_axis], + actual_axis); } else { - Kernel, gpu>::Launch(s, oshape.Size(), - outputs[take_::kOut].dptr(), - inputs[take_::kArr].dptr(), - inputs[take_::kIdx].dptr(), - in_strides, out_strides, arrshape.ndim(), oshape.ndim(), - idxshape.ndim(), arrshape[actual_axis], actual_axis); + Kernel, gpu>::Launch(s, oshape.Size(), + outputs[take_::kOut].dptr(), + inputs[take_::kArr].dptr(), + inputs[take_::kIdx].dptr(), + out_strides[actual_axis-1], + in_strides[actual_axis-1], + in_strides[actual_axis], + arrshape.ndim(), oshape.ndim(), + idxshape.ndim(), arrshape[actual_axis], + actual_axis); } } }); diff --git a/src/operator/tensor/indexing_op.h b/src/operator/tensor/indexing_op.h index bb524dd0f5e9..828d761fefd4 100644 --- a/src/operator/tensor/indexing_op.h +++ b/src/operator/tensor/indexing_op.h @@ -296,11 +296,11 @@ inline bool SparseEmbeddingOpBackwardStorageType(const nnvm::NodeAttrs& attrs, return dispatched; } -/*! \brief name the struct Take instead of take - * to avoid conflict with the take function in mshadow +/*! 
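The CPU and GPU kernels above back the general `take` operator; the renamed `TakeNonzeroAxis` path is the one taken when `axis != 0`. A small usage sketch:

```python
import mxnet as mx

a = mx.nd.arange(24).reshape((2, 3, 4))
idx = mx.nd.array([1, 2])

# axis=1 selects slices along the middle dimension; mode='clip' clamps
# out-of-range indices instead of wrapping them
out = mx.nd.take(a, idx, axis=1, mode='clip')
print(out.shape)                   # (2, 2, 4)
```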
\brief name the struct TakeNonzeroAxis for general take when + * axis is not zero, use TakeZeroAxisGPU or TakeZeroAxisCPU for axis zero */ template -struct Take { +struct TakeNonzeroAxis { /*! * \brief Map function for take operator * \param i global thread id @@ -315,28 +315,28 @@ struct Take { */ template MSHADOW_XINLINE static void Map(index_t i, DType* out_data, const DType* in_data, - const IType* idx, - const mshadow::Shape<10> in_stride, - const mshadow::Shape<10> out_stride, + const IType* idx, const int out_prev_stride, + const int in_prev_stride, const int in_stride, const int in_ndims, const int out_ndims, const int idx_ndims, const int axis_dim, const int axis) { // i is the global flattened index in the output - const int64_t out_head_index = (axis == 0) ? 0 : (i / out_stride[axis - 1]); - const int64_t out_rest_index = (axis == 0) ? i : (i % out_stride[axis - 1]); - const int64_t out_mid_index = out_rest_index / in_stride[axis]; + const int64_t out_head_index = i / out_prev_stride; + const int64_t out_rest_index = i % out_prev_stride; + const int64_t out_mid_index = out_rest_index / in_stride; const int64_t out_tail_index = (axis == in_ndims - 1) ? - 0 : (out_rest_index % in_stride[axis]); + 0 : (out_rest_index % in_stride); int64_t idx_index = static_cast(idx[out_mid_index]); if (clip) { idx_index = (idx_index < 0) ? 0 : idx_index; idx_index = (idx_index > axis_dim - 1) ? (axis_dim - 1) : idx_index; + } else { + idx_index %= axis_dim; + idx_index += (idx_index < 0) ? axis_dim : 0; } - idx_index %= axis_dim; - idx_index += (idx_index < 0) ? axis_dim : 0; const int64_t in_tail_index = out_tail_index; const int64_t in_head_index = out_head_index; - int64_t in_src_index = in_tail_index + idx_index * in_stride[axis]; - in_src_index += (axis == 0) ? 0 : in_head_index * in_stride[axis - 1]; + int64_t in_src_index = in_tail_index + idx_index * in_stride; + in_src_index += in_head_index * in_prev_stride; out_data[i] = in_data[in_src_index]; } }; From 8c44af4eba798b379c374c15582f9aea7dd7d8fd Mon Sep 17 00:00:00 2001 From: Leonard Lausen Date: Sat, 26 Oct 2019 00:13:42 -0700 Subject: [PATCH 20/32] [Gluon] Don't serialize shared parameters twice (#16582) Add deduplicate argument (default of False) to save_parameters. --- python/mxnet/gluon/block.py | 35 ++++++++++++++++++++----- tests/python/unittest/test_gluon.py | 40 +++++++++++++++++++++++++++++ 2 files changed, 69 insertions(+), 6 deletions(-) diff --git a/python/mxnet/gluon/block.py b/python/mxnet/gluon/block.py index eff7dd754572..629ff22ec4e0 100644 --- a/python/mxnet/gluon/block.py +++ b/python/mxnet/gluon/block.py @@ -24,7 +24,7 @@ import copy import warnings import re -from collections import OrderedDict +from collections import OrderedDict, defaultdict from ..base import mx_real_t, MXNetError from .. import symbol, ndarray, initializer @@ -413,7 +413,7 @@ def _collect_params_with_prefix(self, prefix=''): ret.update(child._collect_params_with_prefix(prefix + name)) return ret - def save_parameters(self, filename): + def save_parameters(self, filename, deduplicate=False): """Save parameters to file. Saved parameters can only be loaded with `load_parameters`. Note that this @@ -424,6 +424,10 @@ def save_parameters(self, filename): ---------- filename : str Path to file. + deduplicate : bool, default False + If True, save shared parameters only once. Otherwise, if a Block + contains multiple sub-blocks that share parameters, each of the + shared parameters will be separately saved for every sub-block. 
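A usage sketch of the new `deduplicate` flag, modeled on the regression test added later in this patch:

```python
import mxnet as mx
from mxnet import gluon

class B(gluon.Block):
    def __init__(self, params=None):
        super(B, self).__init__(params=params)
        with self.name_scope():
            self.weight = self.params.get('weight', shape=(10, 10))

class C(gluon.Block):
    def __init__(self, b1, b2):
        super(C, self).__init__()
        self.b1, self.b2 = b1, b2

b1 = B()
c = C(b1, B(b1.collect_params()))  # two sub-blocks sharing one weight
c.initialize()

c.save_parameters('tmp.params', deduplicate=True)
print(len(mx.nd.load('tmp.params')))  # 1: the shared weight is saved once
c.load_parameters('tmp.params')
```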
References ---------- @@ -431,7 +435,17 @@ def save_parameters(self, filename): `_ """ params = self._collect_params_with_prefix() - arg_dict = {key : val._reduce() for key, val in params.items()} + + if deduplicate: + # Shared parameters are stored only a single time as of MXNet 1.6. + # Shared parameters are registered under multiple prefixes returned by + # _collect_params_with_prefix. We select a single one and only store + # it. In load_parameters it is sufficient for a shared parameter to + # only set it for a single prefix. + reverse_params = {v: k for k, v in params.items()} + params = {v: k for k, v in reverse_params.items()} + + arg_dict = {key: val._reduce() for key, val in params.items()} save_fn = _mx_npx.save if is_np_array() else ndarray.save save_fn(filename, arg_dict) @@ -510,15 +524,24 @@ def load_parameters(self, filename, ctx=None, allow_missing=False, if not any('.' in i for i in loaded.keys()): # legacy loading - del loaded + loaded = None # This should be changed to `del loaded` when dropping Python 2 self.collect_params().load( filename, ctx, allow_missing, ignore_extra, self.prefix, cast_dtype=cast_dtype, dtype_source=dtype_source) return if not allow_missing: - for name in params.keys(): - assert name in loaded, \ + # Shared parameters are stored only a single time as of MXNet 1.6. + # We thus retrieve all prefixes (through _collect_params_with_prefix) + # that a shared parameter is used with. Check that there are no + # missing parameters that were not yet already loaded from the + # shared version. + params_inv = defaultdict(list) + for k, v in params.items(): + params_inv[v].append(k) + + for name, param in params.items(): + assert any(p in loaded for p in params_inv[param]), \ "Parameter '%s' is missing in file '%s', which contains parameters: %s. 
" \ "Set allow_missing=True to ignore missing parameters."%( name, filename, _brief_print_list(loaded.keys())) diff --git a/tests/python/unittest/test_gluon.py b/tests/python/unittest/test_gluon.py index f1d0cc7ac274..f1413e2b99c2 100644 --- a/tests/python/unittest/test_gluon.py +++ b/tests/python/unittest/test_gluon.py @@ -1511,6 +1511,46 @@ def forward(self, x): net2 = Network() net2.load_parameters('tmp.params') +@with_seed() +def test_save_load_deduplicate_with_shared_params(): + class B(mx.gluon.Block): + def __init__(self, params=None): + super(B, self).__init__(params=params) + + with self.name_scope(): + self.weight = self.params.get('weight', shape=(10, 10)) + + class C(mx.gluon.Block): + def __init__(self, b1, b2): + super(C, self).__init__() + self.b1 = b1 + self.b2 = b2 + + b1 = B() + b2 = B(b1.collect_params()) + c = C(b1, b2) + c.initialize() + c.save_parameters('tmp.params', deduplicate=True) + + params = mx.nd.load('tmp.params') + assert len(params) == 1 # Only a single copy of the shared parameter is saved + + b1 = B() + b2 = B(b1.collect_params()) + c = C(b1, b2) + c.load_parameters('tmp.params') + + # Test default behavior + c.save_parameters('tmp2.params', deduplicate=False) + + params = mx.nd.load('tmp2.params') + assert len(params) == 2 # Only a single copy of the shared parameter is saved + + b1 = B() + b2 = B(b1.collect_params()) + c = C(b1, b2) + c.load_parameters('tmp2.params') + @with_seed() def test_symbol_block_save_load(): class Net(gluon.HybridBlock): From e2624551616e930f4106e7366568572208994740 Mon Sep 17 00:00:00 2001 From: Haozheng Fan Date: Sat, 26 Oct 2019 15:43:08 +0800 Subject: [PATCH 21/32] Fix index overflow bug in einsum (#16589) * fix index overflow * check index overflow * fix index overflow in einsum path * fix indent * reduce NPY_MAXARGS * safe accumulate --- benchmark/python/einsum/benchmark_einsum.py | 9 + src/operator/mxnet_op.h | 12 ++ src/operator/numpy/np_einsum_op-inl.h | 170 +++++++++--------- src/operator/numpy/np_einsum_op.cc | 11 ++ src/operator/numpy/np_einsum_path_op-inl.h | 114 ++++++------ .../unittest/test_numpy_interoperability.py | 34 ++++ tests/python/unittest/test_numpy_op.py | 33 ++-- 7 files changed, 226 insertions(+), 157 deletions(-) diff --git a/benchmark/python/einsum/benchmark_einsum.py b/benchmark/python/einsum/benchmark_einsum.py index 3593de2db9e1..6de8223287da 100644 --- a/benchmark/python/einsum/benchmark_einsum.py +++ b/benchmark/python/einsum/benchmark_einsum.py @@ -48,6 +48,15 @@ def test_np_einsum(): cost = measure_cost(500, np.einsum, *args, optimize=True) print("Greedy einsum: {} ms".format(cost * 1000)) + print("RNN Use Case:") + a = np.random.uniform(0, 1, size=(64, 128, 512)) + b = np.random.uniform(0, 1, size=(128, 512, 2, 2)) + args = ['bij, ijkl->bkl', a, b] + cost = measure_cost(2, np.einsum, *args, optimize=True) + print('Greedy einsum: {} ms'.format(cost * 1000)) + cost = measure_cost(2, np.einsum, *args) + print('Basic einsum: {} ms'.format(cost * 1000)) + print('Inner Product:') a = np.ones(6000000) b = np.ones(6000000) diff --git a/src/operator/mxnet_op.h b/src/operator/mxnet_op.h index 8ccc34247b6f..463c71b5b0eb 100644 --- a/src/operator/mxnet_op.h +++ b/src/operator/mxnet_op.h @@ -619,6 +619,18 @@ MSHADOW_XINLINE Shape calc_stride(const Shape& shape) { return stride; } +/* Increment coordinates */ +template +MSHADOW_XINLINE bool inc(Shape* coord, const Shape& shape) { + ++(*coord)[ndim-1]; + #pragma unroll + for (int i = ndim - 1; i > 0 && (*coord)[i] >= shape[i]; --i) { + (*coord)[i] -= 
shape[i]; + ++(*coord)[i-1]; + } + return (*coord)[0] < shape[0]; +} + /* Increment coordinates and modify index */ template MSHADOW_XINLINE void inc(Shape* coord, const Shape& shape, diff --git a/src/operator/numpy/np_einsum_op-inl.h b/src/operator/numpy/np_einsum_op-inl.h index 2145abec682b..051280763331 100644 --- a/src/operator/numpy/np_einsum_op-inl.h +++ b/src/operator/numpy/np_einsum_op-inl.h @@ -73,8 +73,8 @@ namespace mxnet { namespace op { -#define NPY_MAXDIMS 32 -#define NPY_MAXARGS 32 +#define NPY_MAXDIMS 16 +#define NPY_MAXARGS 16 inline TShape get_stride(const TShape& shape) { int ndim = shape.ndim(), prod = 1; @@ -415,40 +415,45 @@ class EinsumOp { } }; // class EinsumOp -template -struct numpy_einsum { +template +struct numpy_einsum{ template MSHADOW_XINLINE static void Map(index_t i, DType* out, common::StaticArray op, mshadow::Shape oshape, - mshadow::Shape ostride, + common::StaticArray, + NPY_MAXARGS> ostride, mshadow::Shape reduceshape, - mshadow::Shape reducestride, - mshadow::Shape itershape, common::StaticArray, - NPY_MAXARGS> iterstride, + NPY_MAXARGS> rstride, int nop, int iop0, const DType* out_grad) { using namespace mxnet_op; - index_t oidx = back ? dot(unravel(dot(unravel(i, oshape), ostride), itershape), - iterstride[iop0]) : i; + mshadow::Shape oidx = unravel(i, oshape); + i = back ? dot(oidx, ostride[iop0]) : i; if (req == kWriteTo) { - out[oidx] = (DType)0; + out[i] = (DType)0; + } + for (int rdim = 0; rdim < dimension; ++rdim) { + if (reduceshape[rdim] == 0) { + return; + } } - for (int j = 0; j < reduceshape.Size(); j++) { - mshadow::Shape idx = unravel(dot(unravel(j, reduceshape), reducestride) + - dot(unravel(i, oshape), ostride), - itershape); - DType tmp = back ? out_grad[dot(idx, iterstride[nop])] : (DType)1; + mshadow::Shape ridx = unravel(0, reduceshape); + AType sum = 0; + do { + AType tmp = back ? static_cast(out_grad[dot(oidx, ostride[nop]) + + dot(ridx, rstride[nop])]): (AType)1; for (int iop = 0; iop < nop; ++iop) { if (iop != iop0) { - index_t k = dot(idx, iterstride[iop]); - tmp = tmp * op[iop][k]; + index_t k = dot(oidx, ostride[iop]) + dot(ridx, rstride[iop]); + tmp = tmp * static_cast(op[iop][k]); } } - out[oidx] = out[oidx] + tmp; - } + sum = sum + tmp; + }while (inc(&ridx, reduceshape)); + out[i] = out[i] + static_cast(sum); } }; @@ -603,12 +608,12 @@ inline void NumpyEinsumProcess(const std::vector& inputs, } /* Step 4: Set up the op_axes for the iterator */ - TShape itershape(ndim_iter, -1), iterstride_true(ndim_iter, -1); + TShape itershape(ndim_iter, -1); + std::vector iterstride(nop + 1, TShape(ndim_iter, 0)); TShape oshape = back ? 
inputs[0].shape_ : outputs[0].shape_; TShape ostride_true = get_stride(oshape); - TShape reduceshape, ostride, reducestride; - std::vector iterstride(nop + 1, TShape(ndim_iter, 0)); - std::vector remainshape(nop), opstride(nop), remainstride(nop); + TShape reduceshape; + std::vector remainshape(nop); int op_axes_arrays[NPY_MAXARGS][NPY_MAXDIMS]; int *op_axes[NPY_MAXARGS]; @@ -632,7 +637,6 @@ inline void NumpyEinsumProcess(const std::vector& inputs, for (idim = 0; idim < ndim_output; ++idim) { iterstride[nop][idim] = ostride_true[idim]; } - iterstride_true = get_stride(itershape); reduceshape = TShape(ndim_iter - ndim_output, 0); for (idim = ndim_output; idim < ndim_iter; ++idim) { reduceshape[idim - ndim_output] = itershape[idim]; @@ -648,30 +652,6 @@ inline void NumpyEinsumProcess(const std::vector& inputs, remainshape[iop] = TShape(rsh.begin(), rsh.end()); } - // calculate stride - ostride = TShape(ndim_output, 0); - for (idim = 0; idim < ndim_output; ++idim) { - ostride[idim] = iterstride_true[idim]; - } - reducestride = TShape(ndim_iter - ndim_output, 0); - for (idim = ndim_output; idim < ndim_iter; ++idim) { - reducestride[idim - ndim_output] = iterstride_true[idim]; - } - for (iop = 0; iop < nop; ++iop) { - opstride[iop] = TShape(opshape[iop].ndim(), 0); - remainstride[iop] = TShape(remainshape[iop].ndim(), 0); - int j = 0; - for (idim = 0; idim < ndim_iter; ++idim) { - if (op_axes_arrays[iop][idim] != -1 && - itershape[idim] == opshape[iop][op_axes_arrays[iop][idim]]) { - opstride[iop][op_axes_arrays[iop][idim]] = iterstride_true[idim]; - } else { - remainstride[iop][j++] = iterstride_true[idim]; - } - } - CHECK_EQ(j, remainstride[iop].ndim()); - } - // exclude the 0-dim case if (ndim_iter == 0) { ndim_iter = 1; @@ -681,14 +661,10 @@ inline void NumpyEinsumProcess(const std::vector& inputs, iterstride[iop] = pad(iterstride[iop], ndim_iter); } oshape = pad(oshape, ndim_iter); - ostride = pad(ostride, ndim_iter); reduceshape = pad(reduceshape, ndim_iter); - reducestride = pad(reducestride, ndim_iter); for (iop = 0; iop < nop; ++iop) { opshape[iop] = pad(opshape[iop], ndim_iter); - opstride[iop] = pad(opstride[iop], ndim_iter); remainshape[iop] = pad(remainshape[iop], ndim_iter); - remainstride[iop] = pad(remainstride[iop], ndim_iter); } if (!back) { @@ -696,28 +672,33 @@ inline void NumpyEinsumProcess(const std::vector& inputs, return; } const TBlob &out_data = outputs[0]; - MSHADOW_TYPE_SWITCH(out_data.type_flag_, DType, { + MXNET_ACC_TYPE_SWITCH(out_data.type_flag_, DType, AType, { mxnet::common::StaticArray op; for (iop = 0; iop < nop; ++iop) { op[iop] = inputs[iop].dptr(); } MXNET_ASSIGN_REQ_SWITCH(req[0], req_type, { MXNET_NDIM_SWITCH_EX(ndim_iter, dimension, { - mxnet::common::StaticArray, NPY_MAXARGS> iterstride_arr; - for (iop = 0; iop <= nop; ++iop) { - iterstride_arr[iop] = iterstride[iop].get(); + mxnet::common::StaticArray, NPY_MAXARGS> ostride_arr; + mxnet::common::StaticArray, NPY_MAXARGS> rstride_arr; + for (iop = 0; iop < nop; ++iop) { + mshadow::Shape otmp, rtmp; + for (idim = 0; idim < dimension; ++idim) { + otmp[idim] = idim < ndim_output ? iterstride[iop][idim] : 1; + rtmp[idim] = idim < dimension - ndim_output ? 
iterstride[iop][idim + ndim_output] : 1; + } + ostride_arr[iop] = otmp; + rstride_arr[iop] = rtmp; } - Kernel, + Kernel, xpu>::Launch(ctx.get_stream(), oshape.Size(), out_data.dptr(), op, oshape.get(), - ostride.get(), + ostride_arr, reduceshape.get(), - reducestride.get(), - itershape.get(), - iterstride_arr, + rstride_arr, nop, -1, reinterpret_cast(NULL)); @@ -743,31 +724,44 @@ inline void NumpyEinsumProcess(const std::vector& inputs, for (int i = 0; i < nop; ++i) { const TBlob &out_data = outputs[i]; const TBlob &out_grad = inputs[0]; - MSHADOW_TYPE_SWITCH(out_data.type_flag_, DType, { + std::vector opstride(nop + 1, TShape(ndim_iter, 0)); + std::vector remainstride(nop + 1, TShape(ndim_iter, 0)); + for (iop = 0; iop <= nop; ++iop) { + int j = 0; + for (idim = 0; idim < ndim_iter; ++idim) { + if (op_axes_arrays[i][idim] == -1 || + opshape[i][op_axes_arrays[i][idim]] == 1) { + remainstride[iop][j++] = iterstride[iop][idim]; + } else { + opstride[iop][op_axes_arrays[i][idim]] = iterstride[iop][idim]; + } + } + } + MXNET_ACC_TYPE_SWITCH(out_data.type_flag_, DType, AType, { mxnet::common::StaticArray op; for (iop = 0; iop < nop; ++iop) { op[iop] = inputs[iop + back].dptr(); } MXNET_ASSIGN_REQ_SWITCH(req[i], req_type, { MXNET_NDIM_SWITCH_EX(ndim_iter, dimension, { - mxnet::common::StaticArray, NPY_MAXARGS> iterstride_arr; + mxnet::common::StaticArray, NPY_MAXARGS> opstride_arr; + mxnet::common::StaticArray, NPY_MAXARGS> remainstride_arr; for (iop = 0; iop <= nop; ++iop) { - iterstride_arr[iop] = iterstride[iop].get(); + opstride_arr[iop] = opstride[iop].get(); + remainstride_arr[iop] = remainstride[iop].get(); } - Kernel, + Kernel, xpu>::Launch(ctx.get_stream(), - opshape[i].Size(), - out_data.dptr(), - op, - opshape[i].get(), - opstride[i].get(), - remainshape[i].get(), - remainstride[i].get(), - itershape.get(), - iterstride_arr, - nop, - i, - out_grad.dptr()); + opshape[i].Size(), + out_data.dptr(), + op, + opshape[i].get(), + opstride_arr, + remainshape[i].get(), + remainstride_arr, + nop, + i, + out_grad.dptr()); }) }) }) @@ -798,13 +792,14 @@ inline void NumpyEinsumForward(const OpStatePtr& state_ptr, std::vector > pos; std::string string_repr; paths = einsum_path(state.subscripts, inputs, true, ctx.run_ctx, &pos, &string_repr); - int paths_len = paths.size(), temp_space_size = 0, max_temp_space_size = 0; + int paths_len = paths.size(); + size_t temp_space_size = 0, max_temp_space_size = 0; std::vector operands(inputs), tmp_operands, temp_space_vec(paths_len - 1); for (int i = 0; i + 1 < paths_len; ++i) { temp_space_size += paths[i].oshape.Size(); } for (int i = 0; i < paths_len; ++i) { - max_temp_space_size = std::max(max_temp_space_size, static_cast(paths[i].oshape.Size())); + max_temp_space_size = std::max(max_temp_space_size, paths[i].oshape.Size()); } temp_space_size += max_temp_space_size; MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { @@ -813,7 +808,7 @@ inline void NumpyEinsumForward(const OpStatePtr& state_ptr, false, outputs[0].type_flag_)); Tensor temp_space = state.tempspace->data().FlatTo1D(); - int begin = max_temp_space_size; + size_t begin = max_temp_space_size; for (int i = 0; i < paths_len - 1; ++i) { TBlob tblob = TBlob(temp_space.Slice(begin, begin + paths[i].oshape.Size())); temp_space_vec[i] = tblob.reshape(paths[i].oshape); @@ -910,12 +905,13 @@ inline void NumpyEinsumBackward(const OpStatePtr& state_ptr, } // calculate temporary space size for temp_grad const std::vector& paths = state.paths; - int paths_len = paths.size(), temp_space_size = 0, 
max_temp_space_size = 0; + int paths_len = paths.size(); + size_t temp_space_size = 0, max_temp_space_size = 0; for (int i = 0; i < paths_len - 1; ++i) { temp_space_size += paths[i].oshape.Size(); } for (int i = 0; i < paths_len; ++i) { - max_temp_space_size = std::max(max_temp_space_size, static_cast(paths[i].oshape.Size())); + max_temp_space_size = std::max(max_temp_space_size, paths[i].oshape.Size()); } temp_space_size += max_temp_space_size; // replay the forward process @@ -936,8 +932,8 @@ inline void NumpyEinsumBackward(const OpStatePtr& state_ptr, } } // calculate temporary space size for tensordot - int tensordot_max_tempspace_size = 0; - int begin_tensordot_tempspace = 0; + size_t tensordot_max_tempspace_size = 0; + size_t begin_tensordot_tempspace = 0; std::vector temp_inputs, temp_outputs; std::vector temp_req; std::vector tensordot_tempspace_size; @@ -999,7 +995,7 @@ inline void NumpyEinsumBackward(const OpStatePtr& state_ptr, } tensordot_tempspace_size.push_back(cur_tensordot_tempspace_size); tensordot_max_tempspace_size = std::max(tensordot_max_tempspace_size, - static_cast(cur_tensordot_tempspace_size)); + cur_tensordot_tempspace_size); } begin_tensordot_tempspace = temp_space_size; temp_space_size += (tensordot_max_tempspace_size + sizeof(DType) - 1) / sizeof(DType); @@ -1010,7 +1006,7 @@ inline void NumpyEinsumBackward(const OpStatePtr& state_ptr, // allocate temporary space for gradients of intermediate results Tensor temp_space = ctx.requested[0].get_space_typed (Shape1(temp_space_size), s); - int begin = max_temp_space_size; + size_t begin = max_temp_space_size; for (int i = 0; i + 1 < paths_len; ++i) { TBlob tblob = TBlob(temp_space.Slice(begin, begin + paths[i].oshape.Size())); temp_grad[i] = tblob.reshape(paths[i].oshape); diff --git a/src/operator/numpy/np_einsum_op.cc b/src/operator/numpy/np_einsum_op.cc index 4d232b9b7c04..522780f5f3ad 100644 --- a/src/operator/numpy/np_einsum_op.cc +++ b/src/operator/numpy/np_einsum_op.cc @@ -305,6 +305,17 @@ bool NumpyEinsumShape(const nnvm::NodeAttrs& attrs, oshape[i] = dimension_dict[static_cast(output_str[i])]; } SHAPE_ASSIGN_CHECK(*out_attrs, 0, oshape); + size_t lim = static_cast(std::numeric_limits::max()); + for (int i = 0; i < num_args; ++i) { + CHECK_LE(in_attrs->at(i).Size(), lim) + << "Size of operand " << i + << " exceeds the maximum index." + << " Try setting `USE_INT64_TENSOR_SIZE`."; + } + CHECK_LE(oshape.Size(), lim) + << "Size of output" + << " exceeds the maximum index." 
+ << " Try setting `USE_INT64_TENSOR_SIZE`."; return shape_is_known(oshape); } diff --git a/src/operator/numpy/np_einsum_path_op-inl.h b/src/operator/numpy/np_einsum_path_op-inl.h index cebd4e8ce9af..968d52106da7 100644 --- a/src/operator/numpy/np_einsum_path_op-inl.h +++ b/src/operator/numpy/np_einsum_path_op-inl.h @@ -80,7 +80,7 @@ struct Contraction { }; struct Alternative { - int cost[2]; + int64_t cost[2]; std::vector positions; SetVector new_input_sets; }; @@ -115,28 +115,28 @@ inline size_t _compute_size_by_dict(const std::bitset& indices, return ret; } -inline int _flop_count(const std::string& idx_contraction, - bool inner, - int num_terms, - const dim_t size_dictionary[]) { +inline int64_t _flop_count(const std::string& idx_contraction, + bool inner, + int num_terms, + const dim_t size_dictionary[]) { size_t overall_size = _compute_size_by_dict(idx_contraction, size_dictionary); int op_factor = std::max(1, num_terms - 1); if (inner) { ++op_factor; } - return overall_size * op_factor; + return static_cast(overall_size) * op_factor; } -inline int _flop_count(const std::bitset& idx_contraction, - bool inner, - int num_terms, - const dim_t size_dictionary[]) { +inline int64_t _flop_count(const std::bitset& idx_contraction, + bool inner, + int num_terms, + const dim_t size_dictionary[]) { size_t overall_size = _compute_size_by_dict(idx_contraction, size_dictionary); int op_factor = std::max(1, num_terms - 1); if (inner) { ++op_factor; } - return overall_size * op_factor; + return static_cast(overall_size) * op_factor; } inline Contraction _find_contraction(const std::vector& positions, @@ -164,16 +164,16 @@ inline int _parse_possible_contraction(const std::vector& positions, const SetVector& input_sets, const std::bitset& output_set, const dim_t idx_dict[], - int memory_limit, - int path_cost, - int naive_cost, + size_t memory_limit, + int64_t path_cost, + int64_t naive_cost, Alternative* ret) { // Find the contraction Contraction contract = _find_contraction(positions, input_sets, output_set); // Sieve the results based on memory_limit size_t new_size = _compute_size_by_dict(contract.new_result, idx_dict); - if (new_size > static_cast(memory_limit)) { + if (new_size > memory_limit) { return -1; } @@ -182,10 +182,10 @@ inline int _parse_possible_contraction(const std::vector& positions, for (auto p : positions) { old_sizes += _compute_size_by_dict(input_sets[p], idx_dict); } - int remove_size = old_sizes - new_size; + int64_t remove_size = static_cast(old_sizes) - static_cast(new_size); - int cost = _flop_count(contract.idx_contract, contract.idx_removed.any(), - positions.size(), idx_dict); + int64_t cost = _flop_count(contract.idx_contract, contract.idx_removed.any(), + positions.size(), idx_dict); ret->cost[0] = -remove_size; ret->cost[1] = cost; @@ -206,7 +206,7 @@ inline void _update_other_results(std::vector* results, int bx = best_con[0], by = best_con[1]; size_t size = results->size(); - for (int i = size - 1; i >= 0; --i) { + for (int i = static_cast(size) - 1; i >= 0; --i) { int x = results->at(i).positions[0], y = results->at(i).positions[1]; // Ignore results involving tensors just contracted @@ -233,9 +233,9 @@ inline void _update_other_results(std::vector* results, inline std::vector > _greedy_path(const SetVector* input_sets, const std::bitset& output_set, const dim_t idx_dict[], - int memory_limit) { - size_t isize = input_sets->size(); - size_t iteration_num = isize; + size_t memory_limit) { + int isize = static_cast(input_sets->size()); + int iteration_num = isize; 
// Handle trivial cases that leaked through if (isize == 1) { return std::vector >{std::vector{0}}; @@ -245,23 +245,23 @@ inline std::vector > _greedy_path(const SetVector* input_sets, // Build up a naive cost std::vector range(isize); - for (size_t i = 0; i < isize; ++i) { + for (int i = 0; i < isize; ++i) { range[i] = i; } Contraction contract = _find_contraction(range, *input_sets, output_set); - int naive_cost = _flop_count(contract.idx_contract, contract.idx_removed.any(), - isize, idx_dict); + int64_t naive_cost = _flop_count(contract.idx_contract, contract.idx_removed.any(), + isize, idx_dict); // Initially iterate over all pairs std::vector known_contractions; Alternative best; - int path_cost = 0; + int64_t path_cost = 0; std::vector > ret; - for (size_t iteration = 0; iteration + 1 < iteration_num; ++iteration) { + for (int iteration = 0; iteration + 1 < iteration_num; ++iteration) { if (iteration == 0) { - for (int x = 0; x < static_cast(isize); ++x) { - for (int y = x + 1; y < static_cast(isize); ++y) { + for (int x = 0; x < isize; ++x) { + for (int y = x + 1; y < isize; ++y) { if (!((input_sets->at(x) & input_sets->at(y)).any())) { continue; } @@ -280,7 +280,7 @@ inline std::vector > _greedy_path(const SetVector* input_sets, } } } else { - for (int x = 0; x < static_cast(isize) - 1; ++x) { + for (int x = 0; x < isize - 1; ++x) { int y = isize - 1; if (!((input_sets->at(x) & input_sets->at(y)).any())) { continue; @@ -303,8 +303,8 @@ inline std::vector > _greedy_path(const SetVector* input_sets, // If we do not have a inner contraction, rescan pairs including outer products if (known_contractions.size() == 0) { // Then check the outer productsj - for (int x = 0; x < static_cast(isize); ++x) { - for (int y = x + 1; y < static_cast(isize); ++y) { + for (int x = 0; x < isize; ++x) { + for (int y = x + 1; y < isize; ++y) { Alternative alternative; int result = _parse_possible_contraction(std::vector{x, y}, *input_sets, @@ -323,7 +323,7 @@ inline std::vector > _greedy_path(const SetVector* input_sets, // If we still did not find any remaining contractions, default back to einsum like behavior if (known_contractions.size() == 0) { std::vector range(isize); - for (size_t i = 0; i < isize; ++i) { + for (int i = 0; i < isize; ++i) { range[i] = i; } ret.push_back(range); @@ -332,17 +332,17 @@ inline std::vector > _greedy_path(const SetVector* input_sets, } // Sort based on first index - int best_cost[2], idx = -1; - size_t size = known_contractions.size(); - for (size_t i = 0; i < size; ++i) { + int64_t best_cost[2]; + int idx = -1, size = static_cast(known_contractions.size()); + for (int i = 0; i < size; ++i) { auto x = known_contractions[i]; if (idx == -1) { best_cost[0] = x.cost[0]; best_cost[1] = x.cost[1]; idx = i; } else if (x.cost[0] < best_cost[0] || - (x.cost[0] == best_cost[0] && - x.cost[1] < best_cost[1])) { + (x.cost[0] == best_cost[0] && + x.cost[1] < best_cost[1])) { best_cost[0] = x.cost[0]; best_cost[1] = x.cost[1]; idx = i; @@ -356,7 +356,7 @@ inline std::vector > _greedy_path(const SetVector* input_sets, // Next iteration only compute contractions with the new tensor // All other contractions have been accounted for input_sets = &best.new_input_sets; - isize = input_sets->size(); + isize = static_cast(input_sets->size()); // Update path and total cost ret.push_back(best.positions); @@ -708,9 +708,9 @@ inline std::vector einsum_path(const std::string& subscripts, // Build a few useful list and sets std::vector input_list = split(parsed_subscripts[0], ","); - size_t 
isize = input_list.size(); + int isize = static_cast(input_list.size()); SetVector input_sets; - for (int i = 0; i < static_cast(isize); ++i) { + for (int i = 0; i < isize; ++i) { input_sets.push_back(str2set(input_list[i])); } std::bitset output_set = str2set(parsed_subscripts[1]); @@ -721,7 +721,7 @@ inline std::vector einsum_path(const std::string& subscripts, dim_t dimension_dict[MAXAXIS]; SetVector broadcast_indices(isize); memset(dimension_dict, -1, sizeof(dimension_dict)); - for (size_t i = 0; i < isize; ++i) { + for (int i = 0; i < isize; ++i) { const std::string& term = input_list[i]; const TShape& sh = operands[i].shape_; CHECK_EQ(sh.ndim(), term.length()) @@ -756,8 +756,8 @@ inline std::vector einsum_path(const std::string& subscripts, // Compute size of each input array plus the output array std::vector size_list(isize + 1); - size_t max_size = -1, memory_arg; - for (size_t i = 0; i < isize; ++i) { + size_t max_size = 0, memory_arg; + for (int i = 0; i < isize; ++i) { size_list[i] = _compute_size_by_dict(input_list[i], dimension_dict); max_size = std::max(max_size, size_list[i]); } @@ -778,7 +778,7 @@ inline std::vector einsum_path(const std::string& subscripts, std::vector > path; if (optimize == false) { path.push_back(std::vector()); - for (size_t i = 0; i < isize; ++i) { + for (int i = 0; i < isize; ++i) { path[0].push_back(i); } } else { @@ -801,7 +801,7 @@ inline std::vector einsum_path(const std::string& subscripts, Contraction contract = _find_contraction(contract_inds, input_sets, output_set); input_sets = contract.remaining; - int cost = _flop_count(contract.idx_contract, + int64_t cost = _flop_count(contract.idx_contract, contract.idx_removed.any(), contract_inds.size(), dimension_dict); @@ -847,9 +847,9 @@ inline std::vector einsum_path(const std::string& subscripts, a < b); }); } - size_t len_idx_result = idx_result.length(); + int len_idx_result = static_cast(idx_result.length()); ret[i].oshape = TShape(len_idx_result, -1); - for (size_t j = 0; j < len_idx_result; ++j) { + for (int j = 0; j < len_idx_result; ++j) { ret[i].oshape[j] = dimension_dict[static_cast(idx_result[j])]; } @@ -867,18 +867,18 @@ inline std::vector einsum_path(const std::string& subscripts, std::vector left_pos, right_pos; left_pos.reserve(MAXAXIS); right_pos.reserve(MAXAXIS); - size_t tmp[MAXAXIS] = {0}; - size_t length_left_input = tmp_inputs[0].length(); - size_t length_right_input = tmp_inputs[1].length(); - for (size_t j = 0; j < length_right_input; ++j) { + int tmp[MAXAXIS] = {0}; + int length_left_input = static_cast(tmp_inputs[0].length()); + int length_right_input = static_cast(tmp_inputs[1].length()); + for (int j = 0; j < length_right_input; ++j) { if (contract.idx_removed.test(static_cast(tmp_inputs[1][j]))) { tmp[static_cast(tmp_inputs[1][j])] = j; } } - for (size_t j = 0; j < length_left_input; ++j) { + for (int j = 0; j < length_left_input; ++j) { if (contract.idx_removed.test(static_cast(tmp_inputs[0][j]))) { - left_pos.push_back(static_cast(j)); - right_pos.push_back(static_cast(tmp[static_cast(tmp_inputs[0][j])])); + left_pos.push_back(j); + right_pos.push_back(tmp[static_cast(tmp_inputs[0][j])]); } } // Calculate left_pos and right_pos @@ -887,11 +887,11 @@ inline std::vector einsum_path(const std::string& subscripts, // Calculate do_einsum ret[i].do_einsum = (tensor_result != idx_result); // Calculate tshape - CHECK_EQ(tensor_result.length(), len_idx_result) + CHECK_EQ(static_cast(tensor_result.length()), len_idx_result) << "tensordot produces dim " << 
tensor_result.length() << ", while einsum produces dim " << len_idx_result << "."; ret[i].tshape = TShape(len_idx_result, -1); - for (size_t j = 0; j < len_idx_result; ++j) { + for (int j = 0; j < len_idx_result; ++j) { ret[i].tshape[j] = dimension_dict[static_cast(tensor_result[j])]; } // Calculate blas2einsum_str diff --git a/tests/python/unittest/test_numpy_interoperability.py b/tests/python/unittest/test_numpy_interoperability.py index 9e8156f3239c..62004ac6d263 100644 --- a/tests/python/unittest/test_numpy_interoperability.py +++ b/tests/python/unittest/test_numpy_interoperability.py @@ -258,6 +258,7 @@ def _add_workload_einsum(): size_dict = dict(zip(chars, sizes)) configs = [ + # test_einsum_broadcast ('ij...,j...->ij...', [(2, 3, 4), (3,)]), ('ij...,...j->ij...', [(2, 3, 4), (3,)]), ('ij...,j->ij...', [(2, 3, 4), (3,)]), @@ -310,6 +311,39 @@ def _add_workload_einsum(): ('abjk,kl,jl,ab->ab', [(1, 1, 5, 4), (4, 6), (5, 6), (7, 7)]), ('obk,ijk->ioj', [(2, 4, 8), (2, 4, 8)]), ] + # check_einsum_sums + configs.extend([('i->', [(i,)]) for i in range(1, 17)]) + configs.extend([('...i->...', [(2, 3, i,)]) for i in range(1, 17)]) + configs.extend([('i...->...', [(2, i,)]) for i in range(1, 17)]) + configs.extend([('i...->...', [(2, 3, i,)]) for i in range(1, 17)]) + configs.extend([('ii', [(i, i,)]) for i in range(1, 17)]) + configs.extend([('..., ...', [(3, i,), (2, 3, i,)]) for i in range(1, 17)]) + configs.extend([('...i, ...i', [(2, 3, i,), (i,)]) for i in range(1, 17)]) + configs.extend([('i..., i...', [(i, 3, 2,), (i,)]) for i in range(1, 11)]) + configs.extend([('i, j', [(3,), (i,)]) for i in range(1, 17)]) + configs.extend([('ij, j', [(4, i), (i,)]) for i in range(1, 17)]) + configs.extend([('ji, j', [(i, 4), (i,)]) for i in range(1, 17)]) + configs.extend([('ij, jk', [(4, i), (i, 6)]) for i in range(1, 8)]) + configs.extend([ + ('ij,jk,kl', [(3, 4), (4, 5), (5, 6)]), + ('ijk, jil -> kl', [(3, 4, 5), (4, 3, 2)]), + ('i, i, i -> i', [(8,), (8,), (8,)]), + (',i->', [(), (9,)]), + ('i,->', [(9,), ()]), + ]) + configs.extend([('...,...', [(n,), (n,)]) for n in range(1, 25)]) + configs.extend([('i,i', [(n,), (n,)]) for n in range(1, 25)]) + configs.extend([('i,->i', [(n,), ()]) for n in range(1, 25)]) + configs.extend([(',i->i', [(), (n,)]) for n in range(1, 25)]) + configs.extend([('i,->', [(n,), ()]) for n in range(1, 25)]) + configs.extend([(',i->', [(), (n,)]) for n in range(1, 25)]) + configs.extend([('...,...', [(n - 1,), (n - 1,)]) for n in range(1, 25)]) + configs.extend([('i,i', [(n - 1,), (n - 1,)]) for n in range(1, 25)]) + configs.extend([('i,->i', [(n - 1,), ()]) for n in range(1, 25)]) + configs.extend([(',i->i', [(), (n - 1,)]) for n in range(1, 25)]) + configs.extend([('i,->', [(n - 1,), ()]) for n in range(1, 25)]) + configs.extend([(',i->', [(), (n - 1,)]) for n in range(1, 25)]) + for optimize in [False, True]: for config in configs: subscripts, args = config diff --git a/tests/python/unittest/test_numpy_op.py b/tests/python/unittest/test_numpy_op.py index ae8ad621df75..5476fbee8be4 100644 --- a/tests/python/unittest/test_numpy_op.py +++ b/tests/python/unittest/test_numpy_op.py @@ -3496,16 +3496,22 @@ def dbg(name, data): _np.dot(args[0].T, _np.dot(_np.ones((2, 2)), args[2].T)), _np.dot(_np.dot(args[0], args[1]).T, _np.ones((2, 2))))), # broadcast bug - (('ij, ij -> i'), [(1, 4), (2, 4)], lambda *args: (_np.sum(args[1], axis=0)[None, :], - _np.tile(args[0], [2, 1]))), + ('ij, ij -> i', [(1, 4), (2, 4)], lambda *args: (_np.sum(args[1], axis=0)[None, :], + 
_np.tile(args[0], [2, 1]))), + # issue #16576 + # commented due to long running time + # ('abiz,abjz->abij', [(64, 8, 128, 512), (64, 8, 128, 512)], lambda *args: (_np.matmul(_np.ones((64, 8, 128, 128)), args[1]), + # _np.matmul(_np.ones((64, 8, 128, 128)), args[0]))), ] - dtypes = ['int32', 'float16', 'float32', 'float64'] + dtypes = ['float16', 'float32', 'float64', 'int32'] + acc_type = {'float16': 'float32', 'float32': 'float64', 'float64': 'float64', + 'int32': 'int64'} for hybridize in [False, True]: for dtype in dtypes: for config in configs: for optimize in [False, True]: - rtol = 1e-0 if dtype == 'float16' else 1e-3 - atol = 1e-1 if dtype == 'float16' else 1e-5 + rtol = 1e-2 if dtype == 'float16' else 1e-3 + atol = 1e-4 if dtype == 'float16' else 1e-5 (subscripts, operands, get_grad) = config test_einsum = TestEinsum(subscripts, optimize) if hybridize: @@ -3513,11 +3519,11 @@ def dbg(name, data): x = [] x_np = [] for shape in operands: - x_np.append(_np.array(_np.random.uniform(-10.0, 10.0, shape), - dtype=dtype)) - x.append(np.array(x_np[-1], dtype=dtype)) + tmp = _np.array(_np.random.uniform(-1.0, 1.0, shape), dtype=dtype) + x_np.append(tmp.astype(acc_type[dtype])) + x.append(np.array(tmp, dtype=dtype)) x[-1].attach_grad() - expected_np = _np.einsum(subscripts, *x_np, optimize=optimize) + expected_np = _np.einsum(subscripts, *x_np, optimize=optimize).astype(dtype) with mx.autograd.record(): out_mx = test_einsum(*x) assert out_mx.shape == expected_np.shape @@ -3535,7 +3541,7 @@ def dbg(name, data): expected_np = _np.einsum(subscripts, *x_np, optimize=optimize) assert_almost_equal(out_mx.asnumpy(), expected_np, rtol=rtol, atol=atol) for (iop, op) in enumerate(x): - assert_almost_equal(op.grad.asnumpy(), get_grad(*x_np)[iop], rtol=rtol, atol=atol) + assert_almost_equal(op.grad.asnumpy(), get_grad(*x_np)[iop].astype(dtype), rtol=rtol, atol=atol) configs = [ (('ij,jk,kl->il'), [(2, 2), (2, 5), (5, 2)]), (('ea,fb,abcd,gc,hd->efgh'), [(5, 5), (5, 5), (5, 5, 5, 5), (5, 5), (5, 5)]), @@ -3545,8 +3551,8 @@ def dbg(name, data): for dtype in dtypes: for config in configs: (subscripts, operands) = config - rtol = 1e-0 if dtype == 'float16' else 1e-2 - atol = 1e-1 if dtype == 'float16' else 1e-2 + rtol = 1e-2 if dtype == 'float16' else 1e-3 + atol = 1e-4 if dtype == 'float16' else 1e-5 grad = [] x_np = [] for shape in operands: @@ -3560,7 +3566,8 @@ def dbg(name, data): test_einsum = TestEinsum(subscripts, optimize) if hybridize: test_einsum.hybridize() - expected_np = _np.einsum(subscripts, *x_np, optimize=optimize) + expected_np = _np.einsum(subscripts, *[op.astype(acc_type[dtype]) for op in x_np], + optimize=optimize).astype(dtype) with mx.autograd.record(): out_mx = test_einsum(*x) assert out_mx.shape == expected_np.shape From 29e467be25d6b70d9e188cfb49ed2389f396ab39 Mon Sep 17 00:00:00 2001 From: Zhennan Qin Date: Sat, 26 Oct 2019 15:56:35 +0800 Subject: [PATCH 22/32] Move some subgraph verbose to MXNET_SUBGRAPH_VERBOSE=2 (#16622) * Move subgraph pass log to verbose=2 * Run CI --- src/operator/subgraph/build_subgraph.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/operator/subgraph/build_subgraph.cc b/src/operator/subgraph/build_subgraph.cc index 0f4c570331a2..8e7617d57c44 100644 --- a/src/operator/subgraph/build_subgraph.cc +++ b/src/operator/subgraph/build_subgraph.cc @@ -717,7 +717,7 @@ nnvm::Graph BuildSubgraph(nnvm::Graph&& g) { using namespace sg; const SubgraphPropertyPtr& subg_prop = g.GetAttr("subgraph_property"); - if (verbose) { + if (verbose > 1) { 
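    // Per this patch, the subgraph property banner below is printed only when
    // MXNET_SUBGRAPH_VERBOSE is set to 2 or higher.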
const std::string& prop_name = subg_prop->HasAttr("property_name") ? subg_prop->GetAttr("property_name") : "partition graph"; From c130cc9ef94776fb7ef5fb9ef1fb6559522dcacf Mon Sep 17 00:00:00 2001 From: Xingjian Shi Date: Sat, 26 Oct 2019 18:47:09 -0700 Subject: [PATCH 23/32] add npx reshape (#16640) --- python/mxnet/_numpy_op_doc.py | 66 ++++++++ src/operator/numpy/np_matrix_op-inl.h | 54 ++++++- src/operator/numpy/np_matrix_op.cc | 206 ++++++++++++++++++++++--- src/operator/numpy/np_matrix_op.cu | 3 + tests/python/unittest/test_numpy_op.py | 63 ++++++++ 5 files changed, 371 insertions(+), 21 deletions(-) diff --git a/python/mxnet/_numpy_op_doc.py b/python/mxnet/_numpy_op_doc.py index d9bb378d3049..bcbef9d047d1 100644 --- a/python/mxnet/_numpy_op_doc.py +++ b/python/mxnet/_numpy_op_doc.py @@ -961,3 +961,69 @@ def _np_broadcast_to(array, shape, out=None): [1., 2., 3.]]) """ pass + + +def _npx_reshape(a, newshape, reverse=False, order='C'): + """ + Gives a new shape to an array without changing its data. + This function always returns a copy of the input array. + + Parameters + ---------- + a : ndarray + Array to be reshaped. + newshape : int or tuple of ints + The new shape should be compatible with the original shape. + If an integer, then the result will be a 1-D array of that length. + One shape dimension can be -1. In this case, the value is inferred + from the length of the array and remaining dimensions. + -2 to -6 are used for data manipulation. + + - -2 copy this dimension from the input to the output shape. + - -3 will skip the current dimension if and only if its size is one. + - -4 copy all remaining input dimensions to the output shape. + - -5 use the product of two consecutive dimensions of the input + shape as the output. + - -6 split one dimension of the input into two dimensions passed + subsequent to -6 in the new shape. + + reverse : bool, optional + If set to true, the special values will be inferred from right to left. + order : {'C'}, optional + Read the elements of `a` using this index order, and place the + elements into the reshaped array using this index order. 'C' + means to read / write the elements using C-like index order, + with the last axis index changing fastest, back to the first + axis index changing slowest. Other order types such as 'F'/'A' + may be added in the future. + + Returns + ------- + reshaped_array : ndarray + It will always be a copy of the original array. This behavior is different + from the official NumPy ``reshape`` operator, where views of the original array may be + generated.
+ + Examples + -------- + >>> x = np.ones((2, 3, 8)) + >>> npx.reshape(x, (-2, -2, 2, -1)).shape + (2, 3, 2, 4) + >>> x = np.ones((8, 3, 3, 3, 4, 4)) + >>> npx.reshape(x, (-6, 2, -1, -4)).shape + (2, 4, 3, 3, 3, 4, 4) + >>> x = np.ones((8, 3, 3, 3, 4, 4)) + >>> npx.reshape(x, (-5, -4)).shape + (24, 3, 3, 4, 4) + >>> x = np.ones((8, 1, 1, 1, 3)) + >>> npx.reshape(x, (-2, -3, -3, -3, -2)).shape + (8, 3) + >>> x = np.ones((8, 3, 3, 3, 3, 8)) + >>> npx.reshape(x, (-4, -5), reverse=True).shape + (8, 3, 3, 3, 24) + >>> x = np.ones((8, 3, 2, 4, 8)) + >>> npx.reshape(x, (-4, -1, 2, -6), reverse=True).shape + (8, 3, 2, 4, 4, 2) + """ + pass diff --git a/src/operator/numpy/np_matrix_op-inl.h b/src/operator/numpy/np_matrix_op-inl.h index b3206bf4aa75..9ce84835f1a8 100644 --- a/src/operator/numpy/np_matrix_op-inl.h +++ b/src/operator/numpy/np_matrix_op-inl.h @@ -27,6 +27,7 @@ #include #include +#include #include "../tensor/matrix_op-inl.h" #include "../nn/concat-inl.h" #include "../../common/utils.h" @@ -51,6 +52,58 @@ struct NumpyVstackParam : public dmlc::Parameter { } }; +struct NumpyReshapeParam : public dmlc::Parameter { + mxnet::TShape newshape; + std::string order; + DMLC_DECLARE_PARAMETER(NumpyReshapeParam) { + DMLC_DECLARE_FIELD(newshape) + .describe("The new shape should be compatible with the original shape." + " If an integer, then the result will be a 1-D array of that length." + " One shape dimension can be -1. In this case, the value is inferred" + " from the length of the array and remaining dimensions."); + DMLC_DECLARE_FIELD(order) + .set_default("C") + .describe("Read the elements of a using this index order, and place the elements into" + " the reshaped array using this index order. 'C' means to read/write the elements" + " using C-like index order, with the last axis index changing fastest," + " back to the first axis index changing slowest." + " Note that currently only C-like order is" + " supported"); + } +}; + +struct NumpyXReshapeParam : public dmlc::Parameter { + mxnet::TShape newshape; + bool reverse; + std::string order; + DMLC_DECLARE_PARAMETER(NumpyXReshapeParam) { + DMLC_DECLARE_FIELD(newshape) + .describe("The new shape should be compatible with the original shape." + " If an integer, then the result will be a 1-D array of that length." + " One shape dimension can be -1. In this case, the value is inferred" + " from the length of the array and remaining dimensions." + " -2 to -6 are used for data manipulation." + " -2 copy this dimension from the input to the output shape." + " -3 will skip current dimension if and only if the current dim size is one." + " -4 copy all remain of the input dimensions to the output shape." + " -5 use the product of two consecutive dimensions of the input" + " shape as the output." + " -6 split one dimension of the input into two dimensions passed" + " subsequent to -6 in the new shape."); + DMLC_DECLARE_FIELD(reverse) + .set_default(false) + .describe("If true then the special values are inferred from right to left"); + DMLC_DECLARE_FIELD(order) + .set_default("C") + .describe("Read the elements of a using this index order, and place the elements into" + " the reshaped array using this index order. 'C' means to read/write the elements" + " using C-like index order, with the last axis index changing fastest," + " back to the first axis index changing slowest." 
+ " Note that currently only C-like order is" + " supported"); + } +}; + template void NumpyTranspose(const nnvm::NodeAttrs& attrs, const OpContext& ctx, @@ -731,7 +784,6 @@ inline void HSplitOpBackward(const nnvm::NodeAttrs &attrs, } SplitOpBackwardImpl(attrs, ctx, inputs, req, outputs, real_axis); } - } // namespace op } // namespace mxnet diff --git a/src/operator/numpy/np_matrix_op.cc b/src/operator/numpy/np_matrix_op.cc index 7bcd6ad27b52..0a6f9a150d8b 100644 --- a/src/operator/numpy/np_matrix_op.cc +++ b/src/operator/numpy/np_matrix_op.cc @@ -34,6 +34,9 @@ DMLC_REGISTER_PARAMETER(NumpyTransposeParam); DMLC_REGISTER_PARAMETER(NumpyRollParam); DMLC_REGISTER_PARAMETER(NumpyMoveaxisParam); DMLC_REGISTER_PARAMETER(NumpyRot90Param); +DMLC_REGISTER_PARAMETER(NumpyReshapeParam); +DMLC_REGISTER_PARAMETER(NumpyXReshapeParam); + bool NumpyTransposeShape(const nnvm::NodeAttrs& attrs, mxnet::ShapeVector *in_attrs, @@ -126,26 +129,6 @@ NNVM_REGISTER_OP(_np_transpose) .add_argument("a", "NDArray-or-Symbol", "Source input") .add_arguments(NumpyTransposeParam::__FIELDS__()); -struct NumpyReshapeParam : public dmlc::Parameter { - mxnet::TShape newshape; - std::string order; - DMLC_DECLARE_PARAMETER(NumpyReshapeParam) { - DMLC_DECLARE_FIELD(newshape) - .describe("The new shape should be compatible with the original shape." - " If an integer, then the result will be a 1-D array of that length." - " One shape dimension can be -1. In this case, the value is inferred" - " from the length of the array and remaining dimensions."); - DMLC_DECLARE_FIELD(order) - .set_default("C") - .describe("Read the elements of a using this index order, and place the elements into" - " the reshaped array using this index order. 'C' means to read/write the elements" - " using C-like index order, with the last axis index changing fastest, back to the" - " first axis index changing slowest. 
Note that currently only C-like order is" - " supported"); - } -}; - -DMLC_REGISTER_PARAMETER(NumpyReshapeParam); bool NumpyReshapeInferShape(const mxnet::TShape& src, mxnet::TShape* dst) { if (shape_is_known(src) && shape_is_known(*dst)) { @@ -202,6 +185,164 @@ bool NumpyReshapeShape(const nnvm::NodeAttrs& attrs, return success; } +bool NumpyXReshapeInferShape(const mxnet::TShape& src, + const mxnet::TShape& target, + mxnet::TShape* output, + const std::string &default_error_msg) { + bool target_shape_is_known = true; + dim_t target_size = 1; + for (int i = 0; i < target.ndim(); ++i) { + if (target[i] < 0) { + target_shape_is_known = false; + target_size = -1; + break; + } else { + target_size *= target[i]; + } + } + if (shape_is_known(src) && target_shape_is_known) { + CHECK_EQ(src.Size(), target_size) << default_error_msg; + *output = TShape(target.begin(), target.end()); + return true; + } else if (!shape_is_known(src) || target.ndim() == -1) { + return false; + } else { + int unknown_axis = -1; + dim_t known_dim_size_prod = 1; + std::vector output_shape_vector; + int src_inx = 0; + for (int i = 0; i < target.ndim(); ++i) { + dim_t proposed_dim = target[i]; + CHECK(proposed_dim >= -6) + << "Dimension size must be greater than -6, received " << proposed_dim; + if (proposed_dim == -1) { + // infer the known dimension + CHECK_LT(unknown_axis, 0) + << "One and only one dim can be inferred"; + unknown_axis = output_shape_vector.size(); + output_shape_vector.push_back(-1); + src_inx++; + } else if (proposed_dim == -2) { + // copy the dimension from src to output + CHECK_LT(src_inx, src.ndim()) + << "Unmatching dimension of proposed new shape"; + known_dim_size_prod *= src[src_inx]; + output_shape_vector.push_back(src[src_inx++]); + } else if (proposed_dim == -3) { + // skip the source dimension if and only if it is one + CHECK_EQ(src[src_inx], 1) + <<"-3 index should only be used to skip dimension size 1"; + src_inx++; + } else if (proposed_dim == -4) { + // copy all remaining dims from source + while (src_inx < src.ndim()) { + known_dim_size_prod *= src[src_inx]; + const dim_t dn = src[src_inx++]; + output_shape_vector.push_back(dn); + } + } else if (proposed_dim == -5) { + // merge two dims from source + CHECK_LT(src_inx, src.ndim()-1) + <<"Not enough dimensions left for the product"; + const dim_t d1 = src[src_inx++]; + const dim_t d2 = src[src_inx++]; + if (!mxnet::dim_size_is_known(d1) || !mxnet::dim_size_is_known(d2)) { + CHECK_LT(unknown_axis, 0) + << "One and only one dim can be inferred"; + unknown_axis = output_shape_vector.size(); + output_shape_vector.push_back(-1); + } else { + known_dim_size_prod *= d1*d2; + output_shape_vector.push_back(d1 * d2); + } + } else if (proposed_dim == -6) { + // split the source dim s into two dims + // read the left dim and then the right dim (either can be -1) + CHECK_LT(i + 2, target.ndim()); + CHECK_LT(src_inx, src.ndim()); + const dim_t d0 = src[src_inx++]; + dim_t d1 = target[++i]; + dim_t d2 = target[++i]; + CHECK(d1 != -1 || d2 != -1) << "Split dims cannot both be -1."; + if (d1 == -1 && d0 >= 0) d1 = d0 / d2; // d0 must be known to do this + if (d2 == -1 && d0 >= 0) d2 = d0 / d1; // d0 must be known to do this + CHECK(d1 * d2 == static_cast(d0) || static_cast(d0) == dim_t(-1)) + <<"Split dims " << d1 << ", " << d2 << " do not divide original dim " << d0; + if (d1 == -1) { + CHECK_LT(unknown_axis, 0) + << "One and only one dim can be inferred"; + unknown_axis = output_shape_vector.size(); + } else if (d2 == -1) { + CHECK_LT(unknown_axis, 0) 
+ << "One and only one dim can be inferred"; + unknown_axis = output_shape_vector.size() + 1; + } + known_dim_size_prod *= d0 == -1 ? 1 : d0; + output_shape_vector.push_back(d1); + output_shape_vector.push_back(d2); + } else { + // greater than 0, new shape + known_dim_size_prod *= proposed_dim; + output_shape_vector.push_back(proposed_dim); + src_inx++; + } + } + + if (unknown_axis > -1) { + // if the input in zero size tensor, the output must be of known shape of zero size + CHECK_NE(known_dim_size_prod, 0) << default_error_msg; + CHECK(src.Size() % known_dim_size_prod == 0) << default_error_msg; + output_shape_vector[unknown_axis] = src.Size() / known_dim_size_prod; + } + + *output = mxnet::TShape(output_shape_vector.begin(), output_shape_vector.end()); + CHECK_EQ((*output).Size(), src.Size()) << default_error_msg; + return true; + } +} + +bool NumpyXReshapeShape(const nnvm::NodeAttrs& attrs, + mxnet::ShapeVector* in_attrs, + mxnet::ShapeVector* out_attrs) { + CHECK_EQ(in_attrs->size(), 1U) << "Input: [data]"; + CHECK_EQ(out_attrs->size(), 1U); + const NumpyXReshapeParam& param = nnvm::get(attrs.parsed); + // sanity check + bool has_unknown_dim_size = false; + for (int i = 0; i < param.newshape.ndim(); ++i) { + if (param.newshape[i] < 0) { + CHECK_GE(param.newshape[i], -6) + << "Dimension size must be greater than or equal to -6"; + if (param.newshape[i] == -1) { + CHECK(!has_unknown_dim_size) << "Can only specify one unknown dimension"; + has_unknown_dim_size = true; + } + } + } + + mxnet::TShape output_shape; + bool success; + std::stringstream ss; + ss << "Cannot reshape array of shape " << in_attrs->at(0) + << " into shape " << param.newshape + << " , reverse = " << param.reverse; + std::string err_msg = ss.str(); + if (!param.reverse) { + success = NumpyXReshapeInferShape(in_attrs->at(0), + param.newshape, &output_shape, err_msg); + } else { + mxnet::TShape rev_in_shape = in_attrs->at(0); + mxnet::TShape rev_newshape = param.newshape; + std::reverse(rev_in_shape.begin(), rev_in_shape.end()); + std::reverse(rev_newshape.begin(), rev_newshape.end()); + success = NumpyXReshapeInferShape(rev_in_shape, + rev_newshape, &output_shape, err_msg); + std::reverse(output_shape.begin(), output_shape.end()); + } + SHAPE_ASSIGN_CHECK(*out_attrs, 0, output_shape); + return success; +} + NNVM_REGISTER_OP(_np_reshape) .describe(R"code()code" ADD_FILELINE) .add_alias("_npi_reshape") @@ -227,6 +368,31 @@ NNVM_REGISTER_OP(_np_reshape) .add_argument("a", "NDArray-or-Symbol", "Array to be reshaped.") .add_arguments(NumpyReshapeParam::__FIELDS__()); + +NNVM_REGISTER_OP(_npx_reshape) +.describe(R"code()code" ADD_FILELINE) +.set_num_inputs(1) +.set_num_outputs(1) +.set_attr_parser(ParamParser) +.set_attr("FInferShape", NumpyXReshapeShape) +.set_attr("FInferType", ElemwiseType<1, 1>) +.set_attr("FGradient", ElemwiseGradUseNone{"_backward_reshape"}) +.set_attr("FCompute", UnaryOp::IdentityCompute) +.set_attr("FInplaceOption", + [](const NodeAttrs& attrs) { + return std::vector >{{0, 0}}; + }) +.set_attr("FInplaceIdentity", + [](const NodeAttrs& attrs){ + return std::vector{true}; + }) +.set_attr("FListInputNames", + [](const NodeAttrs& attrs) { + return std::vector{"a"}; + }) +.add_argument("a", "NDArray-or-Symbol", "Array to be reshaped.") +.add_arguments(NumpyXReshapeParam::__FIELDS__()); + bool NumpySqueezeShape(const nnvm::NodeAttrs& attrs, mxnet::ShapeVector *in_attrs, mxnet::ShapeVector *out_attrs) { diff --git a/src/operator/numpy/np_matrix_op.cu b/src/operator/numpy/np_matrix_op.cu index 
8c8301bb3bbf..6b4f7a11a9a2 100644 --- a/src/operator/numpy/np_matrix_op.cu +++ b/src/operator/numpy/np_matrix_op.cu @@ -109,5 +109,8 @@ NNVM_REGISTER_OP(_npi_hsplit) NNVM_REGISTER_OP(_npi_hsplit_backward) .set_attr("FCompute", HSplitOpBackward); +NNVM_REGISTER_OP(_npx_reshape) +.set_attr("FCompute", UnaryOp::IdentityCompute); + } // namespace op } // namespace mxnet diff --git a/tests/python/unittest/test_numpy_op.py b/tests/python/unittest/test_numpy_op.py index 5476fbee8be4..98a7b05dca9f 100644 --- a/tests/python/unittest/test_numpy_op.py +++ b/tests/python/unittest/test_numpy_op.py @@ -3674,6 +3674,69 @@ def test_np_true_divide(): assert_almost_equal(out_mx.asnumpy(), out_np, rtol=1e-3, atol=1e-3, use_broadcast=False) +@with_seed() +@use_np +def test_npx_reshape(): + class TestNumpyXReshape(HybridBlock): + def __init__(self, newshape, reverse): + super(TestNumpyXReshape, self).__init__() + self._newshape = newshape + self._reverse = reverse + + def hybrid_forward(self, F, a, *args, **kwargs): + return F.npx.reshape(a, self._newshape, reverse=self._reverse) + + test_cases = [ + [(2, 3, 5, 5), (-2, -1), False, (2, 75)], + [(2, 3, 5, 5), (-2, -2, -1), False, (2, 3, 25)], + [(5, 3, 4, 5), (-2, -1, -2), False, (5, 15, 4)], + [(2, 3, 5, 4), (-1, -2, -2), False, (8, 3, 5)], + [(2, 3, 5, 5), (-2, -2, -2, -2), False, (2, 3, 5, 5)], + [(2, 1, 4, 5), (-2, -3, -2, -2), False, (2, 4, 5)], + [(1, 1, 4, 1), (-3, -3, -2, -2), False, (4, 1)], + [(1, 1, 1, 1), (-3, -3, -3, -3), False, ()], + [(2, 4, 5, 3), (-1, 2, 2, 1), False, (30, 2, 2, 1)], + [(2, 3, 5, 6), (-4,), False, (2, 3, 5, 6)], + [(2, 3, 5, 6), (6, 1, -4), False, (6, 1, 5, 6)], + [(2, 3, 5, 6), (-5, -5), False, (6, 30)], + [(2, 3, 5, 6), (-5, -1), False, (6, 30)], + [(64,), (-6, 16, 4), False, (16, 4)], + [(64,), (-6, 16, -1), False, (16, 4)], + [(64, 1, 2, 3), (-6, 16, -1, -4), False, (16, 4, 1, 2, 3)], + [(8, 5, 4, 6), (-4, -1, 3, -6), True, (8, 5, 4, 2, 3)] + ] + for hybridize in [True, False]: + for shape, newshape, reverse, expected_ret_shape in test_cases: + for grad_req in ['write', 'add']: + # test gluon + test_reshape = TestNumpyXReshape(newshape=newshape, reverse=reverse) + if hybridize: + test_reshape.hybridize() + + a = mx.np.random.uniform(-1, 1, shape).astype(np.float32) + init_a_grad = mx.np.random.uniform(-1, 1, shape).astype(np.float32) + a.attach_grad(grad_req=grad_req) + if grad_req == 'add': + a.grad[:] = init_a_grad + with mx.autograd.record(): + y = test_reshape(a) + assert y.shape == expected_ret_shape,\ + 'y.shape={}, expected_ret_shape={}'.format(y.shape, expected_ret_shape) + assert_almost_equal(y.asnumpy(), a.asnumpy().reshape(expected_ret_shape), rtol=1e-3, atol=1e-5) + + # test backward + mx.autograd.backward(y) + expected_grad = _np.ones(shape) + if grad_req == 'add': + expected_grad += init_a_grad.asnumpy() + assert_almost_equal(a.grad.asnumpy(), expected_grad, rtol=1e-3, atol=1e-5) + + # test imperative + npx_out = npx.reshape(a, newshape, reverse=reverse) + expected_out = _np.reshape(a.asnumpy(), expected_ret_shape) + assert_almost_equal(npx_out.asnumpy(), expected_out, rtol=1e-3, atol=1e-5) + + if __name__ == '__main__': import nose nose.runmodule() From 9f21cddb3f6cc81e67a192f313066f7e9edd7fa8 Mon Sep 17 00:00:00 2001 From: Leonard Lausen Date: Sun, 27 Oct 2019 10:21:14 -0700 Subject: [PATCH 24/32] RNNOp only call cuda/cudnn if GPU ctx is requested (#16632) --- src/operator/rnn-inl.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h index 
ead7501a48b0..b448261f215d 100644 --- a/src/operator/rnn-inl.h +++ b/src/operator/rnn-inl.h @@ -422,6 +422,8 @@ class RNNOp { init_mem_ = false; reserve_mem_size_ = 0; #endif + + if (ctx_.dev_type == kGPU) { #if MXNET_USE_CUDNN == 1 init_cudnn_ = false; dtype_ = mshadow::DataType::kCudnnFlag; @@ -505,6 +507,7 @@ class RNNOp { LOG(FATAL) << "RNN on GPU is only available for cuDNN at the moment."; } #endif // MXNET_USE_CUDNN == 1 + } if (ctx_.dev_type == kCPU) { this->init_space_ = false; @@ -523,6 +526,7 @@ class RNNOp { } ~RNNOp() { + if (ctx_.dev_type == kGPU) { #if MXNET_USE_CUDNN == 1 CUDNN_CALL(cudnnDestroyTensorDescriptor(hx_desc_)); CUDNN_CALL(cudnnDestroyTensorDescriptor(cx_desc_)); @@ -557,6 +561,7 @@ class RNNOp { CUDNN_CALL(cudnnDestroyRNNDataDescriptor(dy_data_desc_)); #endif // MXNET_USE_CUDNN_GE_7200 #endif // MXNET_USE_CUDNN + } } void Forward(const OpContext &ctx, const std::vector &in_data, From 73c6b4a78f2a1f843dd136ad4a7eea7fd4323ea6 Mon Sep 17 00:00:00 2001 From: Hu Shiwen Date: Mon, 28 Oct 2019 03:31:07 +0800 Subject: [PATCH 25/32] fix bad encode (#16641) --- src/operator/contrib/allclose_op-inl.h | 4 ++-- src/operator/numpy/np_einsum_op-inl.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/operator/contrib/allclose_op-inl.h b/src/operator/contrib/allclose_op-inl.h index a858450f0007..a10c7795e568 100644 --- a/src/operator/contrib/allclose_op-inl.h +++ b/src/operator/contrib/allclose_op-inl.h @@ -58,8 +58,8 @@ struct AllCloseParam : public dmlc::Parameter { .describe("Absolute tolerance."); DMLC_DECLARE_FIELD(equal_nan) .set_default(true) - .describe("Whether to compare NaN’s as equal. If True, NaN’s in A will be considered equal " - "to NaN’s in B in the output array."); + .describe("Whether to compare NaN's as equal. If True, NaN's in A will be considered equal " + "to NaN's in B in the output array."); } }; diff --git a/src/operator/numpy/np_einsum_op-inl.h b/src/operator/numpy/np_einsum_op-inl.h index 051280763331..d2f399b2533d 100644 --- a/src/operator/numpy/np_einsum_op-inl.h +++ b/src/operator/numpy/np_einsum_op-inl.h @@ -394,7 +394,7 @@ struct NumpyEinsumParam: public dmlc::Parameter { .set_default("") .describe("Specifies the subscripts for summation as comma separated list" " of subscript labels. An implicit (classical Einstein summation) calculation" - " is performed unless the explicit indicator ‘->’ is included as well as" + " is performed unless the explicit indicator '->' is included as well as" " subscript labels of the precise output form."); DMLC_DECLARE_FIELD(optimize) .set_default(0); From 84d61a1df3eca95be68c15d39fe057064a4da018 Mon Sep 17 00:00:00 2001 From: Robert Stone Date: Sun, 27 Oct 2019 12:31:50 -0700 Subject: [PATCH 26/32] [Perl] - ndarray to native array conversion fix (#16635) --- perl-package/AI-MXNet/lib/AI/MXNet/NDArray.pm | 6 +++++- perl-package/AI-MXNet/t/test_ndarray.t | 19 ++++++++++++++++++- 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/NDArray.pm b/perl-package/AI-MXNet/lib/AI/MXNet/NDArray.pm index f75cc84b2a8f..1d968c14a487 100644 --- a/perl-package/AI-MXNet/lib/AI/MXNet/NDArray.pm +++ b/perl-package/AI-MXNet/lib/AI/MXNet/NDArray.pm @@ -116,7 +116,11 @@ method STORABLE_thaw($cloning, $buf, $writable) method split_array(@args) { - $self->shape->[0] > 1 ? $self->split(num_outputs => $self->shape->[0], squeeze_axis => @{ $self->shape } > 1 ? 
1 : 0, axis => 0) : [$self]; + my $shape = $self->shape; + return [] if $shape->[0] == 0; + my $list = $self->split(num_outputs=>$shape->[0], + squeeze_axis=>int(@$shape > 1), axis=>0); + $shape->[0] == 1 ? [ $list ] : $list; } method at(Index @indices) diff --git a/perl-package/AI-MXNet/t/test_ndarray.t b/perl-package/AI-MXNet/t/test_ndarray.t index a6cd113c3f89..1e290b4bc715 100644 --- a/perl-package/AI-MXNet/t/test_ndarray.t +++ b/perl-package/AI-MXNet/t/test_ndarray.t @@ -19,7 +19,7 @@ use strict; use warnings; use AI::MXNet qw(mx); use AI::MXNet::TestUtils qw(almost_equal same rand_ndarray randint zip); -use Test::More tests => 251; +use Test::More tests => 261; use PDL; use File::Temp qw(tempdir); use IO::File; @@ -217,6 +217,22 @@ sub test_histogram ok(same($bins->aspdl, pdl([10, 20, 30, 60]))); } +sub test_array_overload +{ + # array conversions are largely calls to mx->nd->split(), but have + # special cases around dimensions of length 0 and 1. + is_deeply([ @{ mx->nd->array(zeros(7, 0)) } ], []); + is_deeply(mx->nd->zeros([3, 7])->[0]->shape, [ 7 ]); + is_deeply(mx->nd->zeros([2, 7])->[0]->shape, [ 7 ]); + is_deeply(mx->nd->zeros([1, 7])->[0]->shape, [ 7 ]); + is_deeply(mx->nd->zeros([3, 7, 11])->[0]->shape, [7, 11]); + is_deeply(mx->nd->zeros([2, 7, 11])->[0]->shape, [7, 11]); + is_deeply(mx->nd->zeros([1, 7, 11])->[0]->shape, [7, 11]); + is_deeply(mx->nd->zeros([3, 7, 11, 13])->[0]->shape, [7, 11, 13]); + is_deeply(mx->nd->zeros([2, 7, 11, 13])->[0]->shape, [7, 11, 13]); + is_deeply(mx->nd->zeros([1, 7, 11, 13])->[0]->shape, [7, 11, 13]); +} + test_ndarray_slice(); test_ndarray_reshape(); test_moveaxis(); @@ -226,3 +242,4 @@ test_linalg_gemm2(); test_image_to_tensor(); test_buffer_load(); test_histogram(); +test_array_overload(); From d12e674e58214fb73e467d16f7362320f3ad8c28 Mon Sep 17 00:00:00 2001 From: Talia <31782251+TEChopra1000@users.noreply.github.com> Date: Sun, 27 Oct 2019 12:33:31 -0700 Subject: [PATCH 27/32] fixing broken links in multiple files - round 3 (#16634) --- .../python/tutorials/extend/custom_layer.md | 2 +- .../gluon_from_experiment_to_deployment.md | 4 +-- .../gluon/training/fit_api_tutorial.md | 2 +- .../packages/ndarray/sparse/train.md | 4 +-- .../packages/ndarray/sparse/train_gluon.md | 35 +++++++++---------- .../packages/onnx/fine_tuning_gluon.md | 2 +- .../python/tutorials/packages/viz/index.rst | 2 +- .../backend/mkldnn/mkldnn_quantization.md | 4 +-- .../tutorials/performance/backend/profiler.md | 2 +- .../performance/backend/tensorrt/tensorrt.md | 4 +-- docs/static_site/src/pages/api/api.html | 2 +- .../tutorials/mxnet_cpp_inference_tutorial.md | 16 ++++----- docs/static_site/src/pages/api/faq/float16.md | 2 +- docs/static_site/src/pages/api/faq/perf.md | 6 ++-- .../pages/get_started/build_from_source.md | 2 +- julia/docs/src/api/io.md | 2 +- julia/docs/src/tutorial/char-lstm.md | 2 +- julia/docs/src/tutorial/mnist.md | 4 +-- 18 files changed, 48 insertions(+), 49 deletions(-) diff --git a/docs/python_docs/python/tutorials/extend/custom_layer.md b/docs/python_docs/python/tutorials/extend/custom_layer.md index 6002a7812ec7..2fe795ba5439 100644 --- a/docs/python_docs/python/tutorials/extend/custom_layer.md +++ b/docs/python_docs/python/tutorials/extend/custom_layer.md @@ -57,7 +57,7 @@ The rest of methods of the `Block` class are already implemented, and majority o Looking into implementation of [existing layers](https://mxnet.apache.org/api/python/gluon/nn.html), one may find that more often a block inherits from a 
[HybridBlock](https://github.com/apache/incubator-mxnet/blob/master/python/mxnet/gluon/block.py#L428), instead of directly inheriting from `Block`. -The reason for that is that `HybridBlock` allows to write custom layers that can be used in imperative programming as well as in symbolic programming. It is convinient to support both ways, because the imperative programming eases the debugging of the code and the symbolic one provides faster execution speed. You can learn more about the difference between symbolic vs. imperative programming from [this article](https://mxnet.apache.org/architecture/program_model.html). +The reason for that is that `HybridBlock` allows you to write custom layers that can be used in imperative programming as well as in symbolic programming. It is convenient to support both ways, because imperative programming eases debugging of the code and the symbolic one provides faster execution speed. You can learn more about the difference between symbolic vs. imperative programming from [this article](/api/architecture/program_model). Hybridization is a process that Apache MXNet uses to create a symbolic graph of a forward computation. This makes it possible to increase computation performance by optimizing the computational symbolic graph. Once the symbolic graph is created, Apache MXNet caches and reuses it for subsequent computations. diff --git a/docs/python_docs/python/tutorials/getting-started/gluon_from_experiment_to_deployment.md b/docs/python_docs/python/tutorials/getting-started/gluon_from_experiment_to_deployment.md index b1f65e682263..47b629991650 100644 --- a/docs/python_docs/python/tutorials/getting-started/gluon_from_experiment_to_deployment.md +++ b/docs/python_docs/python/tutorials/getting-started/gluon_from_experiment_to_deployment.md @@ -99,14 +99,14 @@ ctx = [mx.gpu(i) for i in range(num_gpus)] if num_gpus > 0 else [mx.cpu()] batch_size = per_device_batch_size * max(num_gpus, 1) ``` -Now we will apply data augmentations on training images. This makes minor alterations on the training images, and our model will consider them as distinct images. This can be very useful for fine-tuning on a relatively small dataset, and it will help improve the model. We can use the Gluon [DataSet API](https://mxnet.apache.org/tutorials/gluon/datasets.html), [DataLoader API](https://mxnet.apache.org/tutorials/gluon/datasets.html), and [Transform API](https://mxnet.apache.org/tutorials/gluon/data_augmentation.html) to load the images and apply the following data augmentations: +Now we will apply data augmentations to the training images. This makes minor alterations to the training images, and our model will consider them as distinct images. This can be very useful for fine-tuning on a relatively small dataset, and it will help improve the model. We can use the Gluon [DataSet API](/api/python/docs/api/gluon/data/index.html#mxnet.gluon.data.Dataset), [DataLoader API](/api/python/docs/api/gluon/data/index.html#mxnet.gluon.data.DataLoader), and [Transform API](/api/python/docs/api/gluon/data/index.html#mxnet.gluon.data.Dataset.transform) to load the images and apply the following data augmentations: 1. Randomly crop the image and resize it to 224x224 2. Randomly flip the image horizontally 3. Randomly jitter color and add noise 4. Transpose the data from `[height, width, num_channels]` to `[num_channels, height, width]`, and map values from [0, 255] to [0, 1] 5. Normalize with the mean and standard deviation from the ImageNet dataset.
-For validation and inference, we only need to apply step 1, 4, and 5. We also need to save the mean and standard deviation values for [inference using C++](https://mxnet.apache.org/versions/master/tutorials/c++/mxnet_cpp_inference_tutorial.html).
+For validation and inference, we only need to apply steps 1, 4, and 5. We also need to save the mean and standard deviation values for [inference using C++](/api/cpp/docs/tutorials/cpp_inference).

```python
jitter_param = 0.4
diff --git a/docs/python_docs/python/tutorials/packages/gluon/training/fit_api_tutorial.md b/docs/python_docs/python/tutorials/packages/gluon/training/fit_api_tutorial.md
index 9e4cbe2f5114..896e5f217aa3 100644
--- a/docs/python_docs/python/tutorials/packages/gluon/training/fit_api_tutorial.md
+++ b/docs/python_docs/python/tutorials/packages/gluon/training/fit_api_tutorial.md
@@ -252,7 +252,7 @@ with warnings.catch_warnings():
     Epoch 2, loss 0.3229
 ```

-You can load the saved model, by using the `load_parameters` API in Gluon. For more details refer to the [Loading model parameters from file tutorial](../blocks/save_load_params.html#saving-model-parameters-to-file)
+You can load the saved model by using the `load_parameters` API in Gluon. For more details, refer to the [Loading model parameters from file tutorial](/api/python/docs/tutorials/packages/gluon/blocks/save_load_params.html#saving-model-parameters-to-file)

```python
diff --git a/docs/python_docs/python/tutorials/packages/ndarray/sparse/train.md b/docs/python_docs/python/tutorials/packages/ndarray/sparse/train.md
index 336185cf7583..23654fc6a33a 100644
--- a/docs/python_docs/python/tutorials/packages/ndarray/sparse/train.md
+++ b/docs/python_docs/python/tutorials/packages/ndarray/sparse/train.md
@@ -240,8 +240,8 @@ The function you will explore is: *y = x1 + 2x2 + ... 10

 ### Preparing the Data

-In MXNet, both [mx.io.LibSVMIter](https://mxnet.apache.org/versions/master/api/python/io/io.html#mxnet.io.LibSVMIter)
-and [mx.io.NDArrayIter](https://mxnet.apache.org/versions/master/api/python/io/io.html#mxnet.io.NDArrayIter)
+In MXNet, both [mx.io.LibSVMIter](/api/python/docs/api/mxnet/io/index.html#mxnet.io.LibSVMIter)
+and [mx.io.NDArrayIter](/api/python/docs/api/mxnet/io/index.html#mxnet.io.NDArrayIter)
 support loading sparse data in CSR format. In this example, we'll use the `NDArrayIter`. You may see some warnings from SciPy. You don't need to worry about those for this example.
diff --git a/docs/python_docs/python/tutorials/packages/ndarray/sparse/train_gluon.md b/docs/python_docs/python/tutorials/packages/ndarray/sparse/train_gluon.md
index 402cc2aeb739..688071062e20 100644
--- a/docs/python_docs/python/tutorials/packages/ndarray/sparse/train_gluon.md
+++ b/docs/python_docs/python/tutorials/packages/ndarray/sparse/train_gluon.md
@@ -20,7 +20,7 @@

 When working on machine learning problems, you may encounter situations where the input data is sparse (i.e. the majority of values are zero). One example of this is in recommendation systems. You could have millions of user and product features, but only a few of these features are present for each sample. Without special treatment, the sheer magnitude of the feature space can lead to out-of-memory situations and cause significant slowdowns when training and making predictions.

-MXNet supports a number of sparse storage types (often called 'stype' for short) for these situations.
In this tutorial, we'll start by generating some sparse data, write it to disk in the LibSVM format and then read back using the [`LibSVMIter`](https://mxnet.apache.org/api/python/io/io.html) for training. We use the Gluon API to train the model and leverage sparse storage types such as [`CSRNDArray`](https://mxnet.apache.org/api/python/ndarray/sparse.html?highlight=csrndarray#mxnet.ndarray.sparse.CSRNDArray) and [`RowSparseNDArray`](https://mxnet.apache.org/api/python/ndarray/sparse.html?highlight=rowsparsendarray#mxnet.ndarray.sparse.RowSparseNDArray) to maximise performance and memory efficiency.
+MXNet supports a number of sparse storage types (often called 'stype' for short) for these situations. In this tutorial, we'll start by generating some sparse data, writing it to disk in the LibSVM format and then reading it back using the [`LibSVMIter`](/api/python/docs/api/mxnet/io/index.html#mxnet.io.LibSVMIter) for training. We use the Gluon API to train the model and leverage sparse storage types such as [`CSRNDArray`](/api/python/docs/api/ndarray/sparse/index.html#mxnet.ndarray.sparse.CSRNDArray) and [`RowSparseNDArray`](/api/python/docs/api/ndarray/sparse/index.html#mxnet.ndarray.sparse.RowSparseNDArray) to maximise performance and memory efficiency.

```python
@@ -63,7 +63,7 @@ print('{:,.0f} non-zero elements'.format(data.data.size))
     10,000 non-zero elements
 ```

-Our storage type is CSR (Compressed Sparse Row) which is the ideal type for sparse data along multiple axes. See [this in-depth tutorial](https://mxnet.apache.org/versions/master/tutorials/sparse/csr.html) for more information. Just to confirm the generation process ran correctly, we can see that the vast majority of values are indeed zero. One of the first questions to ask would be how much memory is saved by storing this data in a [`CSRNDArray`](https://mxnet.apache.org/api/python/ndarray/sparse.html?highlight=csrndarray#mxnet.ndarray.sparse.CSRNDArray) versus a standard [`NDArray`](https://mxnet.apache.org/versions/master/api/python/ndarray/sparse.html?highlight=ndarray#module-mxnet.ndarray). Since sparse arrays are constructed from many components (e.g. `data`, `indices` and `indptr`) we define a function called `get_nbytes` to calculate the number of bytes taken in memory to store an array. We compare the same data stored in a standard [`NDArray`](https://mxnet.apache.org/versions/master/api/python/ndarray/sparse.html?highlight=ndarray#module-mxnet.ndarray) (with `data.tostype('default')`) to the [`CSRNDArray`](https://mxnet.apache.org/api/python/ndarray/sparse.html?highlight=csrndarray#mxnet.ndarray.sparse.CSRNDArray).
+Our storage type is CSR (Compressed Sparse Row) which is the ideal type for sparse data along multiple axes. See [this in-depth tutorial](https://mxnet.apache.org/versions/master/tutorials/sparse/csr.html) for more information. Just to confirm the generation process ran correctly, we can see that the vast majority of values are indeed zero. One of the first questions to ask would be how much memory is saved by storing this data in a [`CSRNDArray`](/api/python/docs/api/ndarray/sparse/index.html#mxnet.ndarray.sparse.CSRNDArray) versus a standard [`NDArray`](/api/python/docs/api/ndarray/ndarray.html#module-mxnet.ndarray). Since sparse arrays are constructed from many components (e.g. `data`, `indices` and `indptr`) we define a function called `get_nbytes` to calculate the number of bytes taken in memory to store an array.
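A minimal sketch of such a helper (the tutorial's own definition is elided from this hunk, so the body below is an assumption rather than code from this patch):

```python
import numpy as np

def get_nbytes(array):
    """Bytes taken in memory by the buffers of a dense or CSR array (a sketch)."""
    def buffer_nbytes(a):
        # element count times element size
        return a.size * np.dtype(a.dtype).itemsize

    if array.stype == 'csr':
        # CSR keeps three buffers: the non-zero values plus the `indices`
        # and `indptr` arrays that locate them.
        return (buffer_nbytes(array.data) +
                buffer_nbytes(array.indices) +
                buffer_nbytes(array.indptr))
    return buffer_nbytes(array)  # dense 'default' storage: a single buffer
```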
We compare the same data stored in a standard [`NDArray`](/api/python/docs/api/ndarray/ndarray.html#module-mxnet.ndarray) (with `data.tostype('default')`) to the [`CSRNDArray`](/api/python/docs/api/ndarray/sparse/index.html#mxnet.ndarray.sparse.CSRNDArray). ```python @@ -94,9 +94,9 @@ Given the extremely high sparsity of the data, we observe a huge memory saving h ### Writing Sparse Data -Since there is such a large size difference between dense and sparse storage formats here, we ideally want to store the data on disk in a sparse storage format too. MXNet supports a format called LibSVM and has a data iterator called [`LibSVMIter`](https://mxnet.apache.org/api/python/io/io.html?highlight=libsvmiter) specifically for data formatted this way. +Since there is such a large size difference between dense and sparse storage formats here, we ideally want to store the data on disk in a sparse storage format too. MXNet supports a format called LibSVM and has a data iterator called [`LibSVMIter`](/api/python/docs/api/mxnet/io/index.html#mxnet.io.LibSVMIter) specifically for data formatted this way. -A LibSVM file has a row for each sample, and each row starts with the label: in this case `0.0` or `1.0` since we have a classification task. After this we have a variable number of `key:value` pairs separated by spaces, where the key is column/feature index and the value is the value of that feature. When working with your own sparse data in a custom format you should try to convert your data into this format. We define a `save_as_libsvm` function to save the `data` ([`CSRNDArray`](https://mxnet.apache.org/versions/master/api/python/ndarray/sparse.html?highlight=csrndarray#mxnet.ndarray.sparse.CSRNDArray)) and `label` (`NDArray`) to disk in LibSVM format. +A LibSVM file has a row for each sample, and each row starts with the label: in this case `0.0` or `1.0` since we have a classification task. After this we have a variable number of `key:value` pairs separated by spaces, where the key is column/feature index and the value is the value of that feature. When working with your own sparse data in a custom format you should try to convert your data into this format. We define a `save_as_libsvm` function to save the `data` ([`CSRNDArray`](/api/python/docs/api/ndarray/sparse/index.html#mxnet.ndarray.sparse.CSRNDArray)) and `label` (`NDArray`) to disk in LibSVM format. ```python @@ -148,10 +148,9 @@ Some storage overhead is introduced by serializing the data as characters (with ### Reading Sparse Data -Using [`LibSVMIter`](https://mxnet.apache.org/api/python/io/io.html?highlight=libsvmiter), we can quickly and easily load data into batches ready for training. Although Gluon [`Dataset`](https://mxnet.apache.org/versions/master/api/python/gluon/data.html?highlight=dataset#mxnet.gluon.data.Dataset)s can be written to return sparse arrays, Gluon [`DataLoader`](https://mxnet.apache.org/versions/master/api/python/gluon/data.html?highlight=dataloader#mxnet.gluon.data.DataLoader)s currently convert each sample to dense before stacking up to create the batch. As a result, [`LibSVMIter`](https://mxnet.apache.org/api/python/io/io.html?highlight=libsvmiter) is the recommended method of loading sparse data in batches. - -Similar to using a [`DataLoader`](https://mxnet.apache.org/versions/master/api/python/gluon/data.html?highlight=dataloader#mxnet.gluon.data.DataLoader), you must specify the required `batch_size`. 
Since we're dealing with sparse data and the column shape isn't explicitly stored in the LibSVM file, we additionally need to provide the shape of the data and label. Our [`LibSVMIter`](https://mxnet.apache.org/api/python/io/io.html?highlight=libsvmiter) returns batches in a slightly different form to a [`DataLoader`](https://mxnet.apache.org/versions/master/api/python/gluon/data.html?highlight=dataloader#mxnet.gluon.data.DataLoader). We get `DataBatch` objects instead of `tuple`. See the [appendix of this tutorial](https://mxnet.apache.org/versions/master/tutorials/gluon/datasets.html) for more information. +Using [`LibSVMIter`](/api/python/docs/api/mxnet/io/index.html#mxnet.io.LibSVMIter), we can quickly and easily load data into batches ready for training. Although Gluon [`Dataset`](/api/python/docs/api/gluon/data/index.html#mxnet.gluon.data.Dataset)s can be written to return sparse arrays, Gluon [`DataLoader`](/api/python/docs/api/gluon/data/index.html#mxnet.gluon.data.DataLoader)s currently convert each sample to dense before stacking up to create the batch. As a result, [`LibSVMIter`](/api/python/docs/api/mxnet/io/index.html#mxnet.io.LibSVMIter) is the recommended method of loading sparse data in batches. +Similar to using a [`DataLoader`](/api/python/docs/api/gluon/data/index.html#mxnet.gluon.data.DataLoader), you must specify the required `batch_size`. Since we're dealing with sparse data and the column shape isn't explicitly stored in the LibSVM file, we additionally need to provide the shape of the data and label. Our [`LibSVMIter`](/api/python/docs/api/mxnet/io/index.html#mxnet.io.LibSVMIter) returns batches in a slightly different form to a [`DataLoader`](/api/python/docs/api/gluon/data/index.html#mxnet.gluon.data.DataLoader). We get `DataBatch` objects instead of `tuple`. ```python data_iter = mx.io.LibSVMIter(data_libsvm=filepath, data_shape=(num_features,), label_shape=(1,), batch_size=10) @@ -215,7 +214,7 @@ Although results will change depending on system specifications and degree of sp Our next step is to define a network. We have an input of 1,000,000 features and we want to make a binary prediction. We don't have any spatial or temporal relationships between features, so we'll use a 3 layer fully-connected network where the last layer has 1 output unit (with sigmoid activation). Since we're working with sparse data, we'd ideally like to use network operators that can exploit this sparsity for improved performance and memory efficiency. -Gluon's [`nn.Dense`](https://mxnet.apache.org/versions/master/api/python/gluon/nn.html?highlight=dense#mxnet.gluon.nn.Dense) block can used with [`CSRNDArray`](https://mxnet.apache.org/api/python/ndarray/sparse.html?highlight=csrndarray#mxnet.ndarray.sparse.CSRNDArray) input arrays but it doesn't exploit the sparsity. Under the hood, [`Dense`](https://mxnet.apache.org/versions/master/api/python/gluon/nn.html?highlight=dense#mxnet.gluon.nn.Dense) uses the [`FullyConnected`](https://mxnet.apache.org/versions/master/api/python/ndarray/ndarray.html?highlight=fullyconnected#mxnet.ndarray.FullyConnected) operator which isn't optimized for [`CSRNDArray`](https://mxnet.apache.org/api/python/ndarray/sparse.html?highlight=csrndarray#mxnet.ndarray.sparse.CSRNDArray) arrays. 
We'll implement a `Block` that does exploit this sparsity, *but first*, let's just remind ourselves of the [`Dense`](https://mxnet.apache.org/versions/master/api/python/gluon/nn.html?highlight=dense#mxnet.gluon.nn.Dense) implementation by creating an equivalent `Block` called `FullyConnected`.
+Gluon's [`nn.Dense`](/api/python/docs/api/gluon/nn/index.html#mxnet.gluon.nn.Dense) block can be used with [`CSRNDArray`](/api/python/docs/api/ndarray/sparse/index.html#mxnet.ndarray.sparse.CSRNDArray) input arrays but it doesn't exploit the sparsity. Under the hood, [`Dense`](/api/python/docs/api/gluon/nn/index.html#mxnet.gluon.nn.Dense) uses the [`FullyConnected`](/api/python/docs/api/ndarray/ndarray.html#mxnet.ndarray.FullyConnected) operator which isn't optimized for [`CSRNDArray`](/api/python/docs/api/ndarray/sparse/index.html#mxnet.ndarray.sparse.CSRNDArray) arrays. We'll implement a `Block` that does exploit this sparsity, *but first*, let's just remind ourselves of the [`Dense`](/api/python/docs/api/gluon/nn/index.html#mxnet.gluon.nn.Dense) implementation by creating an equivalent `Block` called `FullyConnected`.

```python
@@ -235,11 +234,11 @@ class FullyConnected(mx.gluon.HybridBlock):
         return F.FullyConnected(x, weight, bias, num_hidden=self._units)
 ```

-Our `weight` and `bias` parameters are dense (see `stype='default'`) and so are their gradients (see `grad_stype='default'`). Our `weight` parameter has shape `(units, in_units)` because the [`FullyConnected`](https://mxnet.apache.org/versions/master/api/python/ndarray/ndarray.html?highlight=fullyconnected#mxnet.ndarray.FullyConnected) operator performs the following calculation:
+Our `weight` and `bias` parameters are dense (see `stype='default'`) and so are their gradients (see `grad_stype='default'`). Our `weight` parameter has shape `(units, in_units)` because the [`FullyConnected`](/api/python/docs/api/ndarray/ndarray.html#mxnet.ndarray.FullyConnected) operator performs the following calculation:

 $$Y = XW^T + b$$

-We could instead have created our parameter with shape `(in_units, units)` and avoid the transpose of the weight matrix. We'll see why this is so important later on. And instead of [`FullyConnected`](https://mxnet.apache.org/versions/master/api/python/ndarray/ndarray.html?highlight=fullyconnected#mxnet.ndarray.FullyConnected) we could have used [`mx.sparse.dot`](https://mxnet.apache.org/versions/master/api/python/ndarray/sparse.html?highlight=sparse.dot#mxnet.ndarray.sparse.dot) to fully exploit the sparsity of the [`CSRNDArray`](https://mxnet.apache.org/api/python/ndarray/sparse.html?highlight=csrndarray#mxnet.ndarray.sparse.CSRNDArray) input arrays. We'll now implement an alternative `Block` called `FullyConnectedSparse` using these ideas. We take `grad_stype` of the `weight` as an argument (called `weight_grad_stype`), since we're going to change this later on.
+We could instead have created our parameter with shape `(in_units, units)` and avoided the transpose of the weight matrix. We'll see why this is so important later on. And instead of [`FullyConnected`](/api/python/docs/api/ndarray/ndarray.html#mxnet.ndarray.FullyConnected) we could have used [`mx.sparse.dot`](/api/python/docs/api/ndarray/sparse/index.html?#mxnet.ndarray.sparse.dot) to fully exploit the sparsity of the [`CSRNDArray`](/api/python/docs/api/ndarray/sparse/index.html#mxnet.ndarray.sparse.CSRNDArray) input arrays. We'll now implement an alternative `Block` called `FullyConnectedSparse` using these ideas.
We take `grad_stype` of the `weight` as an argument (called `weight_grad_stype`), since we're going to change this later on. ```python @@ -261,7 +260,7 @@ class FullyConnectedSparse(mx.gluon.HybridBlock): Once again, we're using a dense `weight`, so both `FullyConnected` and `FullyConnectedSparse` will return dense array outputs. When constructing a multi-layer network therefore, only the first layer needs to be optimized for sparse inputs. Our first layer is often responsible for reducing the feature dimension dramatically (e.g. 1,000,000 features down to 128 features). We'll set the number of units in our 3 layers to be 128, 8 and 1. -We will use [`timeit`](https://docs.python.org/2/library/timeit.html) to check the performance of these two variants, and analyse some [MXNet Profiler](https://mxnet.apache.org/versions/master/tutorials/python/profiler.html) traces that have been created from these benchmarks. Additionally, we will inspect the memory usage of the weights (and gradients) using the `print_memory_allocation` function defined below: +We will use [`timeit`](https://docs.python.org/2/library/timeit.html) to check the performance of these two variants, and analyse some [MXNet Profiler](/api/python/docs/tutorials/performance/backend/profiler.html) traces that have been created from these benchmarks. Additionally, we will inspect the memory usage of the weights (and gradients) using the `print_memory_allocation` function defined below: ```python @@ -324,7 +323,7 @@ for batch in data_iter: ![fully connected](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/doc/tutorials/ndarray/sparse/fully_connected.png) -We can see the first [`FullyConnected`](https://mxnet.apache.org/versions/master/api/python/ndarray/ndarray.html?highlight=fullyconnected#mxnet.ndarray.FullyConnected) operator takes a significant proportion of time to execute (~25% of the iteration) because there are 1,000,000 input features (to 128). After this, the other [`FullyConnected`](https://mxnet.apache.org/versions/master/api/python/ndarray/ndarray.html?highlight=fullyconnected#mxnet.ndarray.FullyConnected) operators are much faster because they have input features of 128 (to 8) and 8 (to 1). On the backward pass, we see the same pattern (but in reverse). And finally, the parameter update step takes a large amount of time on the weight matrix of the first `FullyConnected` `Block`. When checking the memory allocations below, we can see the weight matrix of the first `FullyConnected` `Block` is responsible for 99.999% of the memory compared to other [`FullyConnected`](https://mxnet.apache.org/versions/master/api/python/ndarray/ndarray.html?highlight=fullyconnected#mxnet.ndarray.FullyConnected) weight matrices. +We can see the first [`FullyConnected`](/api/python/docs/api/ndarray/ndarray.html#mxnet.ndarray.FullyConnected) operator takes a significant proportion of time to execute (~25% of the iteration) because there are 1,000,000 input features (to 128). After this, the other [`FullyConnected`](/api/python/docs/api/ndarray/ndarray.html#mxnet.ndarray.FullyConnected) operators are much faster because they have input features of 128 (to 8) and 8 (to 1). On the backward pass, we see the same pattern (but in reverse). And finally, the parameter update step takes a large amount of time on the weight matrix of the first `FullyConnected` `Block`. 
When checking the memory allocations below, we can see the weight matrix of the first `FullyConnected` `Block` is responsible for 99.999% of the memory compared to other [`FullyConnected`](/api/python/docs/api/ndarray/ndarray.html#mxnet.ndarray.FullyConnected) weight matrices.

```python
@@ -384,7 +383,7 @@ for batch in data_iter:

 ![fully connected sparse](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/doc/tutorials/ndarray/sparse/fully_connected_sparse.png)

-We see the forward pass of `dot` and `add` (equivalent to [`FullyConnected`](https://mxnet.apache.org/versions/master/api/python/ndarray/ndarray.html?highlight=fullyconnected#mxnet.ndarray.FullyConnected) operator) is much faster now: 1.54ms vs 0.26ms. And this explains the reduction in overall time for the epoch. We didn't gain any benefit on the backward pass or parameter updates though.
+We see the forward pass of `dot` and `add` (equivalent to the [`FullyConnected`](/api/python/docs/api/ndarray/ndarray.html#mxnet.ndarray.FullyConnected) operator) is much faster now: 1.54ms vs 0.26ms. And this explains the reduction in overall time for the epoch. We didn't gain any benefit on the backward pass or parameter updates though.

 ![fully connected sparse backward](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/doc/tutorials/ndarray/sparse/fully_connected_sparse_backward.png)

@@ -408,7 +407,7 @@ Memory Allocation for Weight Gradient:

 ### Benchmark: `FullyConnectedSparse` with `grad_stype=row_sparse`

-One useful outcome of sparsity in our [`CSRNDArray`](https://mxnet.apache.org/api/python/ndarray/sparse.html?highlight=csrndarray#mxnet.ndarray.sparse.CSRNDArray) input is that our gradients will be row sparse. We can exploit this fact to give us potentially huge memory savings and speed improvements. Creating our `weight` parameter with shape `(units, in_units)` and not transposing in the forward pass are important pre-requisite for obtaining row sparse gradients. Using [`nn.Dense`](https://mxnet.apache.org/versions/master/api/python/gluon/nn.html?highlight=dense#mxnet.gluon.nn.Dense) would have led to column sparse gradients which are not supported in MXNet. We previously had `grad_stype` of the `weight` parameter in the first layer set to `'default'` so we were handling the gradient as a dense array. Switching this to `'row_sparse'` can give us these potential improvements.
+One useful outcome of sparsity in our [`CSRNDArray`](/api/python/docs/api/ndarray/sparse/index.html#mxnet.ndarray.sparse.CSRNDArray) input is that our gradients will be row sparse. We can exploit this fact to give us potentially huge memory savings and speed improvements. Creating our `weight` parameter with shape `(units, in_units)` and not transposing in the forward pass are important prerequisites for obtaining row sparse gradients. Using [`nn.Dense`](/api/python/docs/api/gluon/nn/index.html#mxnet.gluon.nn.Dense) would have led to column sparse gradients which are not supported in MXNet. We previously had `grad_stype` of the `weight` parameter in the first layer set to `'default'` so we were handling the gradient as a dense array. Switching this to `'row_sparse'` can give us these potential improvements.

```python
@@ -472,12 +471,12 @@ You can optimize this example further by setting the weight's `stype` to `'row_s

 ## Conclusion

-As part of this tutorial, we learned how to write sparse data to disk in LibSVM format and load it back in sparse batches with the [`LibSVMIter`](https://mxnet.apache.org/api/python/io/io.html?highlight=libsvmiter).
We learned how to improve the performance of Gluon's [`nn.Dense`](https://mxnet.apache.org/versions/master/api/python/gluon/nn.html?highlight=dense#mxnet.gluon.nn.Dense) on sparse arrays using `mx.nd.sparse`. And lastly, we set `grad_stype` to `'row_sparse'` to reduce the size of the gradient and speed up the parameter update step. +As part of this tutorial, we learned how to write sparse data to disk in LibSVM format and load it back in sparse batches with the [`LibSVMIter`](/api/python/docs/api/mxnet/io/index.html#mxnet.io.LibSVMIter). We learned how to improve the performance of Gluon's [`nn.Dense`](/api/python/docs/api/gluon/nn/index.html#mxnet.gluon.nn.Dense) on sparse arrays using `mx.nd.sparse`. And lastly, we set `grad_stype` to `'row_sparse'` to reduce the size of the gradient and speed up the parameter update step. ## Recommended Next Steps -* More detail on the [`CSRNDArray`](https://mxnet.apache.org/api/python/ndarray/sparse.html?highlight=csrndarray#mxnet.ndarray.sparse.CSRNDArray) sparse array format can be found in [this tutorial](https://mxnet.apache.org/versions/master/tutorials/sparse/csr.html). -* More detail on the [`RowSparseNDArray`](https://mxnet.apache.org/api/python/ndarray/sparse.html?highlight=rowsparsendarray#mxnet.ndarray.sparse.RowSparseNDArray) sparse array format can be found in [this tutorial](https://mxnet.apache.org/versions/master/tutorials/sparse/row_sparse.html). -* Users of the Module API can see a symbolic only example in [this tutorial](https://mxnet.apache.org/versions/master/tutorials/sparse/train.html). +* More detail on the [`CSRNDArray`](/api/python/docs/api/ndarray/sparse/index.html#mxnet.ndarray.sparse.CSRNDArray) sparse array format can be found in [this tutorial](/api/python/docs/tutorials/packages/ndarray/sparse/csr.html). +* More detail on the [`RowSparseNDArray`](/api/python/docs/api/ndarray/sparse/index.html#mxnet.ndarray.sparse.RowSparseNDArray) sparse array format can be found in [this tutorial](/api/python/docs/tutorials/packages/ndarray/sparse/row_sparse.html). +* Users of the Module API can see a symbolic only example in [this tutorial](/api/python/docs/tutorials/packages/ndarray/sparse/train.html). diff --git a/docs/python_docs/python/tutorials/packages/onnx/fine_tuning_gluon.md b/docs/python_docs/python/tutorials/packages/onnx/fine_tuning_gluon.md index f77731494215..e1eb3044a9fa 100644 --- a/docs/python_docs/python/tutorials/packages/onnx/fine_tuning_gluon.md +++ b/docs/python_docs/python/tutorials/packages/onnx/fine_tuning_gluon.md @@ -36,7 +36,7 @@ To run the tutorial you will need to have installed the following python modules - matplotlib We recommend that you have first followed this tutorial: -- [Inference using an ONNX model on MXNet Gluon](https://mxnet.apache.org/tutorials/onnx/inference_on_onnx_model.html) +- [Inference using an ONNX model on MXNet Gluon](/api/python/docs/tutorials/packages/onnx/inference_on_onnx_model.html) ```python diff --git a/docs/python_docs/python/tutorials/packages/viz/index.rst b/docs/python_docs/python/tutorials/packages/viz/index.rst index c9254a983824..367c8ecc67fb 100644 --- a/docs/python_docs/python/tutorials/packages/viz/index.rst +++ b/docs/python_docs/python/tutorials/packages/viz/index.rst @@ -29,7 +29,7 @@ Visualization References ---------- -- `mxnet.viz <../api/symbol-related/mxnet.visualization.html>`_ +- `mxnet.viz `_ .. 
toctree::
   :hidden:

diff --git a/docs/python_docs/python/tutorials/performance/backend/mkldnn/mkldnn_quantization.md b/docs/python_docs/python/tutorials/performance/backend/mkldnn/mkldnn_quantization.md
index da442e9fb42f..8c15af267cd4 100644
--- a/docs/python_docs/python/tutorials/performance/backend/mkldnn/mkldnn_quantization.md
+++ b/docs/python_docs/python/tutorials/performance/backend/mkldnn/mkldnn_quantization.md
@@ -23,7 +23,7 @@ If you are not familiar with Apache/MXNet quantization flow, please reference [q

 ## Installation and Prerequisites

-Installing MXNet with MKLDNN backend is an easy and essential process. You can follow [How to build and install MXNet with MKL-DNN backend](https://mxnet.apache.org/tutorials/mkldnn/MKLDNN_README.html) to build and install MXNet from source. Also, you can install the release or nightly version via PyPi and pip directly by running:
+Installing MXNet with MKLDNN backend is an easy and essential process. You can follow [How to build and install MXNet with MKL-DNN backend](/api/python/docs/tutorials/performance/backend/mkldnn/mkldnn_readme.html) to build and install MXNet from source. Also, you can install the release or nightly version via PyPi and pip directly by running:

 ```
 # release version
@@ -38,7 +38,7 @@ A quantization script [imagenet_gen_qsym_mkldnn.py](https://github.com/apache/in

 ## Integrate Quantization Flow to Your Project

-Quantization flow works for both symbolic and Gluon models. If you're using Gluon, you can first refer [Saving and Loading Gluon Models](https://mxnet.apache.org/versions/master/tutorials/gluon/save_load_params.html) to hybridize your computation graph and export it as a symbol before running quantization.
+Quantization flow works for both symbolic and Gluon models. If you're using Gluon, you can first refer to [Saving and Loading Gluon Models](/api/python/docs/tutorials/packages/gluon/blocks/save_load_params.html) to hybridize your computation graph and export it as a symbol before running quantization.

 In general, the quantization flow includes 4 steps. The user can get the acceptable accuracy from step 1 to 3 with minimum effort. Most of thing in this stage is out-of-box and the data scientists and researchers only need to focus on how to represent data and layers in their model. After a quantized model is generated, you may want to deploy it online and the performance will be the next key point. Thus, step 4, calibration, can improve the performance a lot by reducing lots of runtime calculation.
diff --git a/docs/python_docs/python/tutorials/performance/backend/profiler.md b/docs/python_docs/python/tutorials/performance/backend/profiler.md
index 6969517cbd58..f90d5ba9559e 100644
--- a/docs/python_docs/python/tutorials/performance/backend/profiler.md
+++ b/docs/python_docs/python/tutorials/performance/backend/profiler.md
@@ -212,7 +212,7 @@ Let's zoom in to check the time taken by operators

 The above picture visualizes the sequence in which the operators were executed and the time taken by each operator.

 ### Profiling Custom Operators
-Should the existing NDArray operators fail to meet all your model's needs, MXNet supports [Custom Operators](https://mxnet.apache.org/versions/master/tutorials/gluon/customop.html) that you can define in Python. In `forward()` and `backward()` of a custom operator, there are two kinds of code: "pure Python" code (NumPy operators included) and "sub-operators" (NDArray operators called within `forward()` and `backward()`).
With that said, MXNet can profile the execution time of both kinds without additional setup. Specifically, the MXNet profiler will break a single custom operator call into a pure Python event and several sub-operator events if there are any. Furthermore, all of those events will have a prefix in their names, which is, conveniently, the name of the custom operator you called. +Should the existing NDArray operators fail to meet all your model's needs, MXNet supports [Custom Operators](/api/python/docs/tutorials/extend/customop.html) that you can define in Python. In `forward()` and `backward()` of a custom operator, there are two kinds of code: "pure Python" code (NumPy operators included) and "sub-operators" (NDArray operators called within `forward()` and `backward()`). With that said, MXNet can profile the execution time of both kinds without additional setup. Specifically, the MXNet profiler will break a single custom operator call into a pure Python event and several sub-operator events if there are any. Furthermore, all of those events will have a prefix in their names, which is, conveniently, the name of the custom operator you called. Let's try profiling custom operators with the following code example: diff --git a/docs/python_docs/python/tutorials/performance/backend/tensorrt/tensorrt.md b/docs/python_docs/python/tutorials/performance/backend/tensorrt/tensorrt.md index 8dc19f183729..63dd678f3f5f 100644 --- a/docs/python_docs/python/tutorials/performance/backend/tensorrt/tensorrt.md +++ b/docs/python_docs/python/tutorials/performance/backend/tensorrt/tensorrt.md @@ -39,7 +39,7 @@ nvidia-docker run -ti mxnet/tensorrt python ## Sample Models ### Resnet 18 -TensorRT is an inference only library, so for the purposes of this blog post we will be using a pre-trained network, in this case a Resnet 18. Resnets are a computationally intensive model architecture that are often used as a backbone for various computer vision tasks. Resnets are also commonly used as a reference for benchmarking deep learning library performance. In this section we'll use a pretrained Resnet 18 from the [Gluon Model Zoo](https://mxnet.apache.org/versions/master/api/python/gluon/model_zoo.html) and compare its inference speed with TensorRT using MXNet with TensorRT integration turned off as a baseline. +TensorRT is an inference only library, so for the purposes of this blog post we will be using a pre-trained network, in this case a Resnet 18. Resnets are a computationally intensive model architecture that are often used as a backbone for various computer vision tasks. Resnets are also commonly used as a reference for benchmarking deep learning library performance. In this section we'll use a pretrained Resnet 18 from the [Gluon Model Zoo](/api/python/docs/api/gluon/model_zoo/index.html) and compare its inference speed with TensorRT using MXNet with TensorRT integration turned off as a baseline. ## Model Initialization ```python @@ -128,7 +128,7 @@ This means that when an MXNet computation graph is constructed, it will be parse During this process MXNet will take care of passing along the input to the node and fetching the results. MXNet will also attempt to remove any duplicated weights (parameters) during the graph initialization to keep memory usage low. That is, if there are graph weights that are used only in the TensorRT sections of the graph, they will be removed from the MXNet set of parameters, and their memory will be freed. 
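The hunk headers above skip over the tutorial's own "Model Initialization" code. Purely for orientation, a minimal baseline along those lines (assuming a GPU context, with TensorRT integration off) might look like:

```python
import mxnet as mx
from mxnet.gluon.model_zoo import vision

ctx = mx.gpu(0)

# Pretrained Resnet 18 from the Gluon Model Zoo as the benchmark network.
net = vision.resnet18_v2(pretrained=True, ctx=ctx)
net.hybridize()  # cache the symbolic graph for faster repeated inference

# One dummy ImageNet-shaped batch; wait_to_read() blocks until the
# asynchronous forward pass has finished, so any timing around it is honest.
batch = mx.nd.random.uniform(shape=(1, 3, 224, 224), ctx=ctx)
net(batch).wait_to_read()
```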
-The examples below shows a Gluon implementation of a Wavenet before and after a TensorRT graph pass. You can see that for this network TensorRT supports a subset of the operators involved. This makes it an interesting example to visualize, as several subgraphs are extracted and replaced with special TensorRT nodes. The Resnet used as an example above would be less interesting to visualization. The entire Resnet graph is supported by TensorRT, and hence the optimized graph would be a single TensorRT node. If your browser is unable to render svg files you can view the graphs in png format: [unoptimized](_static/tutorials/tensorrt/wavenet_unoptimized.png) and [optimized](_static/tutorials/tensorrt/wavenet_optimized.png).
+The examples below show a Gluon implementation of a Wavenet before and after a TensorRT graph pass. You can see that for this network TensorRT supports a subset of the operators involved. This makes it an interesting example to visualize, as several subgraphs are extracted and replaced with special TensorRT nodes. The Resnet used as an example above would be less interesting to visualize. The entire Resnet graph is supported by TensorRT, and hence the optimized graph would be a single TensorRT node. If your browser is unable to render svg files you can view the graphs in png format: [unoptimized](wavenet_unoptimized.svg) and [optimized](wavenet_optimized.svg).

 ## Before
 ![before](wavenet_unoptimized.svg)
diff --git a/docs/static_site/src/pages/api/api.html b/docs/static_site/src/pages/api/api.html
index a1f4ae140701..824756898606 100644
--- a/docs/static_site/src/pages/api/api.html
+++ b/docs/static_site/src/pages/api/api.html
@@ -52,7 +52,7 @@
 - title: Julia
     guide_link: /api/julia
     api_link: /api/julia/docs/api
-    tutorial_link: https://github.com/apache/incubator-mxnet/tree/master/julia/examples
+    tutorial_link: https://mxnet.incubator.apache.org/api/julia/docs/api/#tutorials
     description:
     icon: /assets/img/julia_logo.svg
     tag: julia
diff --git a/docs/static_site/src/pages/api/cpp/docs/tutorials/mxnet_cpp_inference_tutorial.md b/docs/static_site/src/pages/api/cpp/docs/tutorials/mxnet_cpp_inference_tutorial.md
index 0d96817560d0..6d9998d7a7a9 100644
--- a/docs/static_site/src/pages/api/cpp/docs/tutorials/mxnet_cpp_inference_tutorial.md
+++ b/docs/static_site/src/pages/api/cpp/docs/tutorials/mxnet_cpp_inference_tutorial.md
@@ -28,23 +28,23 @@ tag: cpp

 ## Overview
 MXNet provides various useful tools and interfaces for deploying your model for inference. For example, you can use [MXNet Model Server](https://github.com/awslabs/mxnet-model-server) to start a service and host your trained model easily.
-Besides that, you can also use MXNet's different language APIs to integrate your model with your existing service. We provide [Python]({{'/api/python/docs/api/symbol-related/mxnet.module'|relative_url}}), [Java]({{'/api/java/docs/api'|relative_url}}), [Scala]({{'/api/scala/docs/api'|relative_url}}), and [C++]({{'/api/cpp/docs/api'|relative_url}}) APIs.
+Besides that, you can also use MXNet's different language APIs to integrate your model with your existing service. We provide [Python](/api/python/docs/api/), [Java](/api/java/docs/api/#package), [Scala](/api/scala/docs/api), and [C++](/api/cpp/docs/api/) APIs.
 We will focus on the MXNet C++ API. We have slightly modified the code in [C++ Inference Example](https://github.com/apache/incubator-mxnet/tree/master/cpp-package/example/inference) for our use case.
## Prerequisites

-To complete this tutorial, you need:
-- Complete the training part of [Gluon end to end tutorial]({{'api/python/docs/tutorials/packages/gluon/image-augmentation.html'|relative_url}})
-- Learn the basics about [MXNet C++ API]({{'/api/cpp'|relative_url}})
+To complete this tutorial, you need to:
+- Complete the training part of [Gluon end to end tutorial](/api/python/docs/tutorials/getting-started/gluon_from_experiment_to_deployment.html)
+- Learn the basics about [MXNet C++ API](/api/cpp)

 ## Setup the MXNet C++ API
-To use the C++ API in MXNet, you need to build MXNet from source with C++ package. Please follow the [built from source guide]({{'/get_started/ubuntu_setup.html'|relative_url}}), and [C++ Package documentation]({{'/api/cpp'|relative_url}})
+To use the C++ API in MXNet, you need to build MXNet from source with the C++ package. Please follow the [build from source guide](/get_started/ubuntu_setup.html) and the [C++ Package documentation](/api/cpp)
 The summary of those two documents is that you need to build MXNet from source with `USE_CPP_PACKAGE` flag set to 1. For example: `make -j USE_CPP_PACKAGE=1`.

 ## Load the model and run inference
-After you complete [the previous tutorial]({{'/api/python/docs/tutorials/packages/gluon/gluon_from_experiment_to_deployment.html'|relative_url}}), you will get the following output files:
+After you complete [the previous tutorial](/api/python/docs/tutorials/getting-started/gluon_from_experiment_to_deployment.html), you will get the following output files:
 1. Model Architecture stored in `flower-recognition-symbol.json`
 2. Model parameter values stored in `flower-recognition-0040.params` (`0040` is for 40 epochs we ran)
 3. Label names stored in `synset.txt`
@@ -280,8 +280,8 @@ Then it will predict your image:

 Now you can explore more ways to run inference and deploy your models:
 1. [Java Inference examples](https://github.com/apache/incubator-mxnet/tree/master/scala-package/examples/src/main/java/org/apache/mxnetexamples/javaapi/infer)
-2. [Scala Inference examples](/api/scala/docs/tutorials)
-3. [ONNX model inference examples](/api/python/docs/tutorials/deploy/index.html)
+2. [Scala Inference examples](https://github.com/apache/incubator-mxnet/tree/master/scala-package/examples/src/main/scala/org/apache/mxnetexamples/infer)
+3. [ONNX model inference examples](/api/python/docs/tutorials/packages/onnx/inference_on_onnx_model.html)
 4.
[MXNet Model Server Examples](https://github.com/awslabs/mxnet-model-server/tree/master/examples)

 ## References
diff --git a/docs/static_site/src/pages/api/faq/float16.md b/docs/static_site/src/pages/api/faq/float16.md
index e63bf87ac68f..6ffb04054554 100644
--- a/docs/static_site/src/pages/api/faq/float16.md
+++ b/docs/static_site/src/pages/api/faq/float16.md
@@ -133,7 +133,7 @@ if dtype == 'float16':
     output = mx.sym.SoftmaxOutput(data=net_out, name='softmax')
 ```

-If you would like to train ResNet50 model on ImageNet using float16 precision, you can find the full script [here](https://github.com/apache/incubator-mxnet/tree/master/example/image-classificatiIfon/train_imagenet.py)
+If you would like to train the ResNet50 model on ImageNet using float16 precision, you can find the full script [here](https://github.com/apache/incubator-mxnet/tree/master/example/image-classification/train_imagenet.py)

 If you don't have ImageNet dataset at your disposal, you can still run the script above using synthetic float16 data by providing the following command:
diff --git a/docs/static_site/src/pages/api/faq/perf.md b/docs/static_site/src/pages/api/faq/perf.md
index 675304f01241..202a099b324f 100644
--- a/docs/static_site/src/pages/api/faq/perf.md
+++ b/docs/static_site/src/pages/api/faq/perf.md
@@ -64,7 +64,7 @@ Note that _MXNet_ treats all CPUs on a single machine as a single device.
 So whether you specify `cpu(0)` or `cpu()`, _MXNet_ will use all CPU cores on the machine.

 ### Scoring results
-The following table shows performance of [MXNet-1.2.0.rc1](https://github.com/apache/incubator-mxnet/releases/download/1.2.0.rc1/apache-mxnet-src-1.2.0.rc1-incubating.tar.gz),
+The following table shows performance of MXNet-1.2.0.rc1,
 namely number of images that can be predicted per second.
 We used [example/image-classification/benchmark_score.py](https://github.com/dmlc/mxnet/blob/master/example/image-classification/benchmark_score.py)
 to measure the performance on different AWS EC2 machines.
@@ -151,7 +151,7 @@ and V100 (EC2 p3.2xlarge).
 Based on
 [example/image-classification/benchmark_score.py](https://github.com/dmlc/mxnet/blob/master/example/image-classification/benchmark_score.py)
-and [MXNet-1.2.0.rc1](https://github.com/apache/incubator-mxnet/releases/download/1.2.0.rc1/apache-mxnet-src-1.2.0.rc1-incubating.tar.gz), with cuDNN 7.0.5
+and MXNet-1.2.0.rc1, with cuDNN 7.0.5

 - K80 (single GPU)

@@ -214,7 +214,7 @@ Below is the performance result on V100 using float 16.

 Based on
 [example/image-classification/train_imagenet.py](https://github.com/dmlc/mxnet/blob/master/example/image-classification/train_imagenet.py)
-and [MXNet-1.2.0.rc1](https://github.com/apache/incubator-mxnet/releases/download/1.2.0.rc1/apache-mxnet-src-1.2.0.rc1-incubating.tar.gz), with CUDNN 7.0.5. The benchmark script is available at
+and MXNet-1.2.0.rc1, with CUDNN 7.0.5. The benchmark script is available at
 [here](https://github.com/mli/mxnet-benchmark/blob/master/run_vary_batch.sh),
 where the batch size for Alexnet is increased by 16x.
diff --git a/docs/static_site/src/pages/get_started/build_from_source.md b/docs/static_site/src/pages/get_started/build_from_source.md
index 20a4542461c4..1dfa95a82ade 100644
--- a/docs/static_site/src/pages/get_started/build_from_source.md
+++ b/docs/static_site/src/pages/get_started/build_from_source.md
@@ -50,7 +50,7 @@ Building from source follows this general two-step flow of building the shared l
    * [non-Intel CPUs](#recommended-for-Systems-with-non-Intel-CPUs)
 2.
[Install the language API binding(s)](#installing-mxnet-language-bindings) you would like to use for MXNet. MXNet's newest and most popular API is Gluon. Gluon is built into the Python binding. If Python isn't your preference, you still have more options. MXNet supports several other language APIs:
-    - [Python (includes Gluon)]({{'/api/python/index'|relative_url}})
+    - [Python (includes Gluon)]({{'/api/python/docs/api/index.html'|relative_url}})
     - [C++]({{'/api/cpp'|relative_url}})
     - [Clojure]({{'/api/clojure'|relative_url}})
     - [Java]({{'/api/java'|relative_url}})
diff --git a/julia/docs/src/api/io.md b/julia/docs/src/api/io.md
index 34ad3c42bce7..52d172010af4 100644
--- a/julia/docs/src/api/io.md
+++ b/julia/docs/src/api/io.md
@@ -54,7 +54,7 @@ end

 By default, `eachbatch` simply returns the provider itself, so the iterator interface is implemented on the provider type itself. But the extra layer of abstraction allows us to implement a data provider easily via a Julia `Task` coroutine. See the
-data provider defined in [the char-lstm example](tutorial/char-lstm) for an example of using coroutine to define data
+data provider defined in [the char-lstm example](/api/julia/docs/api/tutorial/char-lstm/) for an example of using a coroutine to define data
 providers. The detailed interface functions for the iterator API is listed below:
diff --git a/julia/docs/src/tutorial/char-lstm.md b/julia/docs/src/tutorial/char-lstm.md
index ab7e9352b5ab..1109f3554c17 100644
--- a/julia/docs/src/tutorial/char-lstm.md
+++ b/julia/docs/src/tutorial/char-lstm.md
@@ -38,7 +38,7 @@ network models directly.

 The most important code snippets of this example is shown and explained here. To see and run the complete code, please refer to the
-[examples/char-lstm](https://github.com/dmlc/MXNet.jl/tree/master/examples/char-lstm)
+[examples/char-lstm](https://github.com/apache/incubator-mxnet/tree/master/julia/examples/char-lstm)
 directory. You will need to install [Iterators.jl](https://github.com/JuliaLang/Iterators.jl) and [StatsBase.jl](https://github.com/JuliaStats/StatsBase.jl) to run this
diff --git a/julia/docs/src/tutorial/mnist.md b/julia/docs/src/tutorial/mnist.md
index a404f75efe12..942752364526 100644
--- a/julia/docs/src/tutorial/mnist.md
+++ b/julia/docs/src/tutorial/mnist.md
@@ -23,7 +23,7 @@ multi-layer perceptron and then a convolutional neural network (the LeNet
 architecture) on the [MNIST handwritten digit
 dataset](http://yann.lecun.com/exdb/mnist/). The code for this tutorial could be found in
-[examples/mnist](/api/julia/docs/api/tutorial/mnist/). There are also two Jupyter notebooks that expand a little more on the [MLP](https://github.com/ultradian/julia_notebooks/blob/master/mxnet/mnistMLP.ipynb) and the [LeNet](https://github.com/ultradian/julia_notebooks/blob/master/mxnet/mnistLenet.ipynb), using the more general `ArrayDataProvider`.
+[examples/mnist](https://github.com/apache/incubator-mxnet/tree/master/julia/examples/mnist). There are also two Jupyter notebooks that expand a little more on the [MLP](https://github.com/ultradian/julia_notebooks/blob/master/mxnet/mnistMLP.ipynb) and the [LeNet](https://github.com/ultradian/julia_notebooks/blob/master/mxnet/mnistLenet.ipynb), using the more general `ArrayDataProvider`.

 Simple 3-layer MLP
 ------------------

@@ -36,7 +36,7 @@ using MXNet
 ```

 to load the `MXNet` module. Then we are ready to define the network
-architecture via the [symbolic API](../user-guide/overview.md).
We start +architecture via the [symbolic API](/api/julia/docs/api/user-guide/overview/). We start with a placeholder `data` symbol, ```julia From 22e5ae39d0be39b9f280e89baeaf002c3572bd83 Mon Sep 17 00:00:00 2001 From: Xi Wang Date: Mon, 28 Oct 2019 03:37:55 +0800 Subject: [PATCH 28/32] add type switch to weight tensor (#16543) --- src/operator/numpy/random/np_choice_op.h | 20 +++++++++++++------- tests/python/unittest/test_numpy_op.py | 21 +++++++++++---------- 2 files changed, 24 insertions(+), 17 deletions(-) diff --git a/src/operator/numpy/random/np_choice_op.h b/src/operator/numpy/random/np_choice_op.h index 335cc2741759..a6a7cecfefd5 100644 --- a/src/operator/numpy/random/np_choice_op.h +++ b/src/operator/numpy/random/np_choice_op.h @@ -118,15 +118,17 @@ struct random_indices { // Weighted sample without replacement. // Use perturbed Gumbel variates as keys. +template struct generate_keys { - MSHADOW_XINLINE static void Map(index_t i, float *uniforms, float *weights) { + MSHADOW_XINLINE static void Map(index_t i, float *uniforms, IType *weights) { uniforms[i] = -logf(-logf(uniforms[i])) + logf(weights[i]); } }; // Weighted sample with replacement. +template struct categorical_sampling { - MSHADOW_XINLINE static void Map(index_t i, float *weights, size_t length, + MSHADOW_XINLINE static void Map(index_t i, IType *weights, size_t length, float *uniforms, int64_t *outs) { outs[i] = 0; float acc = 0.0; @@ -179,15 +181,19 @@ void NumpyChoiceForward(const nnvm::NodeAttrs &attrs, const OpContext &ctx, prnd->SampleUniform(&random_numbers, 0, 1); workspace_ptr += ((random_tensor_size * sizeof(float) / 7 + 1) * 8); if (replace) { - Kernel::Launch( - s, output_size, inputs[weight_index].dptr(), input_size, - random_numbers.dptr_, outputs[0].dptr()); + MSHADOW_REAL_TYPE_SWITCH(inputs[weight_index].type_flag_, IType, { + Kernel, xpu>::Launch( + s, output_size, inputs[weight_index].dptr(), input_size, + random_numbers.dptr_, outputs[0].dptr()); + }); } else { Tensor indices = Tensor( reinterpret_cast(workspace_ptr), Shape1(indices_size), s); indices = expr::range((int64_t)0, input_size); - Kernel::Launch(s, input_size, random_numbers.dptr_, - inputs[weight_index].dptr()); + MSHADOW_REAL_TYPE_SWITCH(inputs[weight_index].type_flag_, IType, { + Kernel, xpu>::Launch(s, input_size, random_numbers.dptr_, + inputs[weight_index].dptr()); + }); _sort(random_numbers.dptr_, indices.dptr_, input_size); Copy(outputs[0].FlatTo1D(s), indices.Slice(0, output_size), s); } diff --git a/tests/python/unittest/test_numpy_op.py b/tests/python/unittest/test_numpy_op.py index 98a7b05dca9f..01778099412a 100644 --- a/tests/python/unittest/test_numpy_op.py +++ b/tests/python/unittest/test_numpy_op.py @@ -2490,16 +2490,17 @@ def test_indexing_mode(sampler, set_size, samples_size, replace, weight=None): # test_sample_without_replacement(np.random.choice, num_classes, shape, 10 ** 5, weight) # Test hypridize mode: - for hybridize in [True, False]: - for replace in [True, False]: - test_choice = TestUniformChoice(num_classes // 2, replace) - test_choice_weighted = TestWeightedChoice(num_classes // 2, replace) - if hybridize: - test_choice.hybridize() - test_choice_weighted.hybridize() - weight = np.array(_np.random.dirichlet([1.0] * num_classes)) - test_indexing_mode(test_choice, num_classes, num_classes // 2, replace, None) - test_indexing_mode(test_choice_weighted, num_classes, num_classes // 2, replace, weight) + for wtype in ['float16', 'float32', 'float64']: + for hybridize in [True, False]: + for replace in [True, False]: 
+ test_choice = TestUniformChoice(num_classes // 2, replace) + test_choice_weighted = TestWeightedChoice(num_classes // 2, replace) + if hybridize: + test_choice.hybridize() + test_choice_weighted.hybridize() + weight = np.array(_np.random.dirichlet([1.0] * num_classes)).astype(wtype) + test_indexing_mode(test_choice, num_classes, num_classes // 2, replace, None) + test_indexing_mode(test_choice_weighted, num_classes, num_classes // 2, replace, weight) @with_seed() From 6ab4220e8d50fe3874c5f5280b40b7de1ca444ae Mon Sep 17 00:00:00 2001 From: reminisce Date: Sun, 27 Oct 2019 16:31:39 -0700 Subject: [PATCH 29/32] numpy doc enhancement (#16637) * Change NDArray to ndarray for npx ops Add nonzero boolean mask supports boolean ndarray Add argmin op and interoperability test for nonzero Fix vdot, inner, outter docs Add nonzero to mx.nd.np Add docs Fix * Fix lint * Fix * Fix * Fix get_constant --- python/mxnet/_numpy_op_doc.py | 60 ++ python/mxnet/base.py | 3 + python/mxnet/gluon/parameter.py | 3 +- python/mxnet/ndarray/numpy/_op.py | 156 ++++- python/mxnet/ndarray/numpy/random.py | 12 +- python/mxnet/numpy/linalg.py | 25 + python/mxnet/numpy/multiarray.py | 545 +++++++++++++++++- python/mxnet/numpy/random.py | 80 ++- python/mxnet/numpy/stride_tricks.py | 9 + python/mxnet/numpy/utils.py | 4 +- python/mxnet/numpy_dispatch_protocol.py | 2 + python/mxnet/numpy_extension/random.py | 2 +- python/mxnet/symbol/numpy/_symbol.py | 55 +- python/mxnet/symbol/numpy/random.py | 8 +- python/mxnet/util.py | 63 +- src/operator/contrib/boolean_mask.cc | 2 +- src/operator/contrib/boolean_mask.cu | 2 +- .../numpy/np_broadcast_reduce_op_index.cc | 11 + .../numpy/np_broadcast_reduce_op_index.cu | 3 + src/operator/numpy/np_nonzero_op.cc | 3 +- src/operator/numpy/np_nonzero_op.cu | 2 +- tests/python/unittest/test_numpy_gluon.py | 23 + .../unittest/test_numpy_interoperability.py | 28 + tests/python/unittest/test_numpy_op.py | 63 +- 24 files changed, 1062 insertions(+), 102 deletions(-) diff --git a/python/mxnet/_numpy_op_doc.py b/python/mxnet/_numpy_op_doc.py index bcbef9d047d1..33158baf10a5 100644 --- a/python/mxnet/_numpy_op_doc.py +++ b/python/mxnet/_numpy_op_doc.py @@ -34,6 +34,24 @@ def _np_ones_like(a): ------- out : ndarray Array of ones with the same shape and type as `a`. + + Examples + -------- + >>> x = np.arange(6) + >>> x = x.reshape((2, 3)) + >>> x + array([[0., 1., 2.], + [3., 4., 5.]]) + >>> np.ones_like(x) + array([[1., 1., 1.], + [1., 1., 1.]]) + + >>> y = np.arange(3, dtype=float) + >>> y + array([0., 1., 2.], dtype=float64) + >>> + >>> np.ones_like(y) + array([1., 1., 1.], dtype=float64) """ pass @@ -52,6 +70,23 @@ def _np_zeros_like(a): ------- out : ndarray Array of zeros with the same shape and type as `a`. + + Examples + -------- + >>> x = np.arange(6) + >>> x = x.reshape((2, 3)) + >>> x + array([[0., 1., 2.], + [3., 4., 5.]]) + >>> np.zeros_like(x) + array([[0., 0., 0.], + [0., 0., 0.]]) + >>> y = np.arange(3, dtype=float) + >>> y + array([0., 1., 2.], dtype=float64) + >>> + >>> np.zeros_like(y) + array([0., 0., 0.], dtype=float64) """ pass @@ -477,6 +512,31 @@ def _np_reshape(a, newshape, order='C', out=None): See Also -------- ndarray.reshape : Equivalent method. 
+
+    Examples
+    --------
+    >>> a = np.arange(6).reshape((3, 2))
+    >>> a
+    array([[0., 1.],
+           [2., 3.],
+           [4., 5.]])
+
+    >>> np.reshape(a, (2, 3)) # C-like index ordering
+    array([[0., 1., 2.],
+           [3., 4., 5.]])
+
+    >>> np.reshape(np.ravel(a), (2, 3)) # equivalent to C ravel then C reshape
+    array([[0., 1., 2.],
+           [3., 4., 5.]])
+
+    >>> a = np.array([[1,2,3], [4,5,6]])
+    >>> np.reshape(a, 6)
+    array([1., 2., 3., 4., 5., 6.])
+
+    >>> np.reshape(a, (3,-1))       # the unspecified value is inferred to be 2
+    array([[1., 2.],
+           [3., 4.],
+           [5., 6.]])
     """
diff --git a/python/mxnet/base.py b/python/mxnet/base.py
index cbd9abe9d754..db1fa29ab9b4 100644
--- a/python/mxnet/base.py
+++ b/python/mxnet/base.py
@@ -20,6 +20,7 @@
 """ctypes library of mxnet and helper functions."""
 from __future__ import absolute_import

+import re
 import atexit
 import ctypes
 import os
@@ -853,3 +854,5 @@ def _init_np_op_module(root_module_name, np_module_name, mx_module_name, make_op

         if hasattr(_np_op_doc, name):
             function.__doc__ = getattr(_np_op_doc, name).__doc__
+        else:
+            function.__doc__ = re.sub('NDArray', 'ndarray', function.__doc__)
diff --git a/python/mxnet/gluon/parameter.py b/python/mxnet/gluon/parameter.py
index 8800684ad0b4..957dc2cd69b7 100644
--- a/python/mxnet/gluon/parameter.py
+++ b/python/mxnet/gluon/parameter.py
@@ -674,7 +674,8 @@ def __init__(self, **kwargs):
     """
     def __init__(self, name, value):
         if not isinstance(value, ndarray.NDArray):
-            value = ndarray.array(value)
+            array_fn = _mx_np.array if is_np_array() else ndarray.array
+            value = array_fn(value)
         self.value = value

     class Init(initializer.Initializer):
diff --git a/python/mxnet/ndarray/numpy/_op.py b/python/mxnet/ndarray/numpy/_op.py
index cf66e29d6205..fdb9694146b5 100644
--- a/python/mxnet/ndarray/numpy/_op.py
+++ b/python/mxnet/ndarray/numpy/_op.py
@@ -34,11 +34,11 @@
     'log1p', 'rint', 'radians', 'reciprocal', 'square', 'negative', 'fix', 'ceil', 'floor',
     'trunc', 'logical_not', 'arcsinh', 'arccosh', 'arctanh', 'tensordot', 'histogram', 'eye',
     'linspace', 'logspace', 'expand_dims', 'tile', 'arange', 'split', 'vsplit', 'concatenate',
-    'stack', 'vstack', 'dstack', 'mean', 'maximum', 'minimum', 'swapaxes', 'clip', 'argmax',
+    'stack', 'vstack', 'dstack', 'mean', 'maximum', 'minimum', 'swapaxes', 'clip', 'argmax', 'argmin',
     'std', 'var', 'indices', 'copysign', 'ravel', 'hanning', 'hamming', 'blackman', 'flip',
     'around', 'hypot', 'rad2deg', 'deg2rad', 'unique', 'lcm', 'tril', 'identity', 'take',
     'ldexp', 'vdot', 'inner', 'outer', 'equal', 'not_equal', 'greater', 'less', 'greater_equal', 'less_equal',
-    'hsplit', 'rot90', 'einsum', 'true_divide']
+    'hsplit', 'rot90', 'einsum', 'true_divide', 'nonzero']


 @set_module('mxnet.ndarray.numpy')
@@ -3165,8 +3165,6 @@ def clip(a, a_min, a_max, out=None):
 @set_module('mxnet.ndarray.numpy')
 def argmax(a, axis=None, out=None):
     r"""
-    argmax(a, axis=None, out=None)
-
     Returns the indices of the maximum values along an axis.

     Parameters
     ----------
     a : ndarray
         Input array. Only support ndarrays of dtype `float16`, `float32`, and `float64`.
     axis : int, optional
         By default, the index is into the flattened array, otherwise
         along the specified axis.
     out : ndarray or None, optional
         If provided, the result will be inserted into this array. It should
         be of the appropriate shape and dtype.

     Returns
     -------
     index_array : ndarray of indices whose dtype is same as the input ndarray.
         Array of indices into the array. It has the same shape as `a.shape`
         with the dimension along `axis` removed.

     Notes
     -----
     In case of multiple occurrences of the maximum values, the indices
     corresponding to the first occurrence are returned.

     This function differs from the original `numpy.argmax
     `_ in
     the following aspects:

     - Input type does not support Python native iterables(list, tuple, ...).
     - Output has dtype that is same as the input ndarray.
     - ``out`` param: cannot perform auto broadcasting. ``out`` ndarray's shape must be the same as the expected output.
     - ``out`` param: cannot perform auto type cast. ``out`` ndarray's dtype must be the same as the expected output.
     - ``out`` param does not support scalar input case.

     Examples
     --------
     >>> a = np.arange(6).reshape(2,3) + 10
     >>> a
     array([[10., 11., 12.],
            [13., 14., 15.]])
     >>> np.argmax(a)
     array(5.)
     >>> np.argmax(a, axis=0)
     array([1., 1., 1.])
     >>> np.argmax(a, axis=1)
     array([2., 2.])

     >>> b = np.arange(6)
     >>> b[1] = 5
     >>> b
     array([0., 5., 2., 3., 4., 5.])
     >>> np.argmax(b)  # Only the first occurrence is returned.
     array(1.)

     Specify ``out`` ndarray:

     >>> a = np.arange(6).reshape(2,3) + 10
     >>> b = np.zeros((2,))
     >>> np.argmax(a, axis=1, out=b)
     array([2., 2.])
     >>> b
     array([2., 2.])
     """
     return _npi.argmax(a, axis=axis, keepdims=False, out=out)


+@set_module('mxnet.ndarray.numpy')
+def argmin(a, axis=None, out=None):
+    r"""
+    Returns the indices of the minimum values along an axis.
+
+    Parameters
+    ----------
+    a : ndarray
+        Input array. Only support ndarrays of dtype `float16`, `float32`, and `float64`.
+    axis : int, optional
+        By default, the index is into the flattened array, otherwise
+        along the specified axis.
+    out : ndarray or None, optional
+        If provided, the result will be inserted into this array.
It should
+        be of the appropriate shape and dtype.
+
+    Returns
+    -------
+    index_array : ndarray of indices whose dtype is same as the input ndarray.
+        Array of indices into the array. It has the same shape as `a.shape`
+        with the dimension along `axis` removed.
+
+    Notes
+    -----
+    In case of multiple occurrences of the minimum values, the indices
+    corresponding to the first occurrence are returned.
+
+    This function differs from the original `numpy.argmin
+    `_ in
+    the following aspects:
+
+    - Input type does not support Python native iterables(list, tuple, ...).
+    - Output has dtype that is same as the input ndarray.
+    - ``out`` param: cannot perform auto broadcasting. ``out`` ndarray's shape must be the same as the expected output.
+    - ``out`` param: cannot perform auto type cast. ``out`` ndarray's dtype must be the same as the expected output.
+    - ``out`` param does not support scalar input case.
+
+    Examples
+    --------
+    >>> a = np.arange(6).reshape(2,3) + 10
+    >>> a
+    array([[10., 11., 12.],
+           [13., 14., 15.]])
+    >>> np.argmin(a)
+    array(0.)
+    >>> np.argmin(a, axis=0)
+    array([0., 0., 0.])
+    >>> np.argmin(a, axis=1)
+    array([0., 0.])
+
+    >>> b = np.arange(6)
+    >>> b[2] = 0
+    >>> b
+    array([0., 1., 0., 3., 4., 5.])
+    >>> np.argmin(b)  # Only the first occurrence is returned.
+    array(0.)
+
+    Specify ``out`` ndarray:
+
+    >>> a = np.arange(6).reshape(2,3) + 10
+    >>> b = np.zeros((2,))
+    >>> np.argmin(a, axis=1, out=b)
+    array([0., 0.])
+    >>> b
+    array([0., 0.])
+    """
+    return _npi.argmin(a, axis=axis, keepdims=False, out=out)
+
+
 @set_module('mxnet.ndarray.numpy')
 def mean(a, axis=None, dtype=None, out=None, keepdims=False):  # pylint: disable=arguments-differ
     """
@@ -4761,3 +4828,84 @@ def einsum(*operands, **kwargs):
         subscripts = operands[0]
         operands = operands[1:]
     return _npi.einsum(*operands, subscripts=subscripts, out=out, optimize=int(optimize_arg))
+
+
+@set_module('mxnet.ndarray.numpy')
+def nonzero(a):
+    """
+    Return the indices of the elements that are non-zero.
+
+    Returns a tuple of arrays, one for each dimension of `a`,
+    containing the indices of the non-zero elements in that
+    dimension. The values in `a` are always returned in
+    row-major, C-style order.
+
+    To group the indices by element, rather than dimension, use `argwhere`,
+    which returns a row for each non-zero element.
+
+    Parameters
+    ----------
+    a : ndarray
+        Input array.
+
+    Returns
+    -------
+    tuple_of_arrays : tuple
+        Indices of elements that are non-zero.
+
+    See Also
+    --------
+    ndarray.nonzero :
+        Equivalent ndarray method.
+
+    Notes
+    -----
+    While the nonzero values can be obtained with ``a[nonzero(a)]``, it is
+    recommended to use ``x[x.astype(bool)]`` or ``x[x != 0]`` instead, which
+    will correctly handle 0-d arrays.
+
+    Examples
+    --------
+    >>> x = np.array([[3, 0, 0], [0, 4, 0], [5, 6, 0]])
+    >>> x
+    array([[3, 0, 0],
+           [0, 4, 0],
+           [5, 6, 0]], dtype=int32)
+    >>> np.nonzero(x)
+    (array([0, 1, 2, 2], dtype=int64), array([0, 1, 0, 1], dtype=int64))
+
+    >>> x[np.nonzero(x)]
+    array([3, 4, 5, 6])
+    >>> np.transpose(np.stack(np.nonzero(x)))
+    array([[0, 0],
+           [1, 1],
+           [2, 0],
+           [2, 1]], dtype=int64)
+
+    A common use for ``nonzero`` is to find the indices of an array, where
+    a condition is True. Given an array `a`, the condition `a` > 3 is a
+    boolean array and since False is interpreted as 0, np.nonzero(a > 3)
+    yields the indices of the `a` where the condition is true.
+ + >>> a = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=np.int32) + >>> a > 3 + array([[False, False, False], + [ True, True, True], + [ True, True, True]]) + >>> np.nonzero(a > 3) + (array([1, 1, 1, 2, 2, 2], dtype=int64), array([0, 1, 2, 0, 1, 2], dtype=int64)) + + Using this result to index `a` is equivalent to using the mask directly: + + >>> a[np.nonzero(a > 3)] + array([4, 5, 6, 7, 8, 9], dtype=int32) + >>> a[a > 3] + array([4, 5, 6, 7, 8, 9], dtype=int32) + + ``nonzero`` can also be called as a method of the array. + + >>> (a > 3).nonzero() + (array([1, 1, 1, 2, 2, 2], dtype=int64), array([0, 1, 2, 0, 1, 2], dtype=int64)) + """ + out = _npi.nonzero(a).transpose() + return tuple([out[i] for i in range(len(out))]) diff --git a/python/mxnet/ndarray/numpy/random.py b/python/mxnet/ndarray/numpy/random.py index 583f56e046f3..9d1a6f9119ee 100644 --- a/python/mxnet/ndarray/numpy/random.py +++ b/python/mxnet/ndarray/numpy/random.py @@ -23,11 +23,11 @@ from ..ndarray import NDArray -__all__ = ['randint', 'uniform', 'normal', "choice", "rand"] +__all__ = ['randint', 'uniform', 'normal', "choice", "rand", "multinomial"] def randint(low, high=None, size=None, dtype=None, ctx=None, out=None): - """Return random integers from `low` (inclusive) to `high` (exclusive). + r"""Return random integers from `low` (inclusive) to `high` (exclusive). Return random integers from the "discrete uniform" distribution of the specified dtype in the "half-open" interval [`low`, `high`). If @@ -88,7 +88,7 @@ def randint(low, high=None, size=None, dtype=None, ctx=None, out=None): def uniform(low=0.0, high=1.0, size=None, dtype=None, ctx=None, out=None): - """Draw samples from a uniform distribution. + r"""Draw samples from a uniform distribution. Samples are uniformly distributed over the half-open interval ``[low, high)`` (includes low, but excludes high). In other words, @@ -143,7 +143,7 @@ def uniform(low=0.0, high=1.0, size=None, dtype=None, ctx=None, out=None): def normal(loc=0.0, scale=1.0, size=None, dtype=None, ctx=None, out=None): - """Draw random samples from a normal (Gaussian) distribution. + r"""Draw random samples from a normal (Gaussian) distribution. Samples are distributed according to a normal distribution parametrized by *loc* (mean) and *scale* (standard deviation). @@ -194,7 +194,7 @@ def normal(loc=0.0, scale=1.0, size=None, dtype=None, ctx=None, out=None): def multinomial(n, pvals, size=None): - """multinomial(n, pvals, size=None) + r"""multinomial(n, pvals, size=None) Draw samples from a multinomial distribution. @@ -246,7 +246,7 @@ def multinomial(n, pvals, size=None): def choice(a, size=None, replace=True, p=None, ctx=None, out=None): - """Generates a random sample from a given 1-D array + r"""Generates a random sample from a given 1-D array Parameters ----------- diff --git a/python/mxnet/numpy/linalg.py b/python/mxnet/numpy/linalg.py index 1ca34716d7d4..9ee5156c3bb1 100644 --- a/python/mxnet/numpy/linalg.py +++ b/python/mxnet/numpy/linalg.py @@ -54,10 +54,35 @@ def norm(x, ord=None, axis=None, keepdims=False): n : float or ndarray Norm of the matrix or vector(s). + Notes + ----- + This operator differs from NumPy in the aspect that it always returns a + zero-dim tensor for the cases where Python float values are expected + in NumPy. + References ---------- .. [1] G. H. Golub and C. F. Van Loan, *Matrix Computations*, Baltimore, MD, Johns Hopkins University Press, 1985, pg. 
15 + + Examples + -------- + >>> from numpy import linalg as LA + >>> a = np.arange(9) - 4 + >>> a + array([-4., -3., -2., -1., 0., 1., 2., 3., 4.]) + >>> b = a.reshape((3, 3)) + >>> b + array([[-4., -3., -2.], + [-1., 0., 1.], + [ 2., 3., 4.]]) + >>> LA.norm(a) + array(7.745967) + >>> + >>> LA.norm(b) + array(7.745967) + >>> LA.norm(b, 'fro') + array(7.745967) """ return _mx_nd_np.linalg.norm(x, ord, axis, keepdims) diff --git a/python/mxnet/numpy/multiarray.py b/python/mxnet/numpy/multiarray.py index 623b5fc482d7..5c9de8194a74 100644 --- a/python/mxnet/numpy/multiarray.py +++ b/python/mxnet/numpy/multiarray.py @@ -52,10 +52,10 @@ 'fix', 'ceil', 'floor', 'trunc', 'logical_not', 'arcsinh', 'arccosh', 'arctanh', 'tensordot', 'histogram', 'eye', 'linspace', 'logspace', 'expand_dims', 'tile', 'arange', 'split', 'vsplit', 'concatenate', 'stack', 'vstack', 'dstack', 'mean', 'maximum', 'minimum', - 'swapaxes', 'clip', 'argmax', 'std', 'var', 'indices', 'copysign', 'ravel', 'hanning', 'hamming', + 'swapaxes', 'clip', 'argmax', 'argmin', 'std', 'var', 'indices', 'copysign', 'ravel', 'hanning', 'hamming', 'blackman', 'flip', 'around', 'arctan2', 'hypot', 'rad2deg', 'deg2rad', 'unique', 'lcm', 'tril', 'identity', 'take', 'ldexp', 'vdot', 'inner', 'outer', 'equal', 'not_equal', 'greater', 'less', - 'greater_equal', 'less_equal', 'hsplit', 'rot90', 'einsum', 'true_divide'] + 'greater_equal', 'less_equal', 'hsplit', 'rot90', 'einsum', 'true_divide', 'nonzero'] # Return code for dispatching indexing function call _NDARRAY_UNSUPPORTED_INDEXING = -1 @@ -478,7 +478,7 @@ def __getitem__(self, key): for i in range(key_ndim): if key_shape[i] != shape[i]: raise IndexError('boolean index did not match indexed array along dimension {};' - 'dimension is {} but corresponding boolean dimension is {}' + ' dimension is {} but corresponding boolean dimension is {}' .format(i, shape[i], key_shape[i])) remaining_dims = shape[key_ndim:] data = _reshape_view(self, -1, *remaining_dims) @@ -831,6 +831,17 @@ def item(self, *args): # TODO(junwu): no need to call asnumpy() on the whole array. return self.asnumpy().item(*args) + def nonzero(self): + """Return the indices of the elements that are non-zero. + + Refer to `numpy.nonzero` for full documentation. + + See Also + -------- + numpy.nonzero : equivalent function + """ + return nonzero(self) + @property # pylint: disable= invalid-name, undefined-variable def T(self): @@ -1369,13 +1380,10 @@ def argmax_channel(self, *args, **kwargs): """ raise AttributeError('mxnet.numpy.ndarray object has no attribute argmax_channel') - def argmin(self, *args, **kwargs): - """Convenience fluent method for :py:func:`argmin`. - - The arguments are the same as for :py:func:`argmin`, with - this array as data. - """ - raise NotImplementedError + def argmin(self, axis=None, out=None): # pylint: disable=arguments-differ + """Return indices of the minium values along the given axis. + Refer to `mxnet.numpy.argmin` for full documentation.""" + return argmin(self, axis, out) def clip(self, min=None, max=None, out=None): # pylint: disable=arguments-differ """Return an array whose values are limited to [min, max]. @@ -1925,6 +1933,16 @@ def empty(shape, dtype=_np.float32, order='C', ctx=None): ------- out : ndarray Array of uninitialized (arbitrary) data of the given shape, dtype, and order. 
+
+    Examples
+    --------
+    >>> np.empty([2, 2])
+    array([[ 0.000000e+00, -2.524355e-29],
+           [ nan, -8.592023e+09]])  # uninitialized
+
+    >>> np.empty([2, 2], dtype=int)
+    array([[8751743591039004782, 3196766424264760104],
+           [7583328881310196768, 562950123910254]], dtype=int64)  # uninitialized
     """
     if order != 'C':
         raise NotImplementedError('`empty` only supports order equal to `C`, while received {}'
@@ -1958,6 +1976,19 @@ def array(object, dtype=None, ctx=None):
     -------
     out : ndarray
         An array object satisfying the specified requirements.
+
+    Examples
+    --------
+    >>> np.array([1, 2, 3])
+    array([1., 2., 3.])
+
+    >>> np.array([[1, 2], [3, 4]])
+    array([[1., 2.],
+           [3., 4.]])
+
+    >>> np.array([[1, 0], [0, 1]], dtype=bool)
+    array([[ True, False],
+           [False,  True]])
     """
     if ctx is None:
         ctx = current_context()
@@ -2003,6 +2034,18 @@ def zeros(shape, dtype=_np.float32, order='C', ctx=None):
     -------
     out : ndarray
         Array of zeros with the given shape, dtype, and ctx.
+
+    Examples
+    --------
+    >>> np.zeros(5)
+    array([0., 0., 0., 0., 0.])
+
+    >>> np.zeros((5,), dtype=int)
+    array([0, 0, 0, 0, 0], dtype=int64)
+
+    >>> np.zeros((2, 1))
+    array([[0.],
+           [0.]])
     """
     return _mx_nd_np.zeros(shape, dtype, order, ctx)
@@ -2032,6 +2075,23 @@ def ones(shape, dtype=_np.float32, order='C', ctx=None):
     -------
     out : ndarray
         Array of ones with the given shape, dtype, and ctx.
+
+    Examples
+    --------
+    >>> np.ones(5)
+    array([1., 1., 1., 1., 1.])
+
+    >>> np.ones((5,), dtype=int)
+    array([1, 1, 1, 1, 1], dtype=int64)
+
+    >>> np.ones((2, 1))
+    array([[1.],
+           [1.]])
+
+    >>> s = (2,2)
+    >>> np.ones(s)
+    array([[1., 1.],
+           [1., 1.]])
     """
     return _mx_nd_np.ones(shape, dtype, order, ctx)
@@ -2332,6 +2392,18 @@ def add(x1, x2, out=None, **kwargs):
     -------
     add : ndarray or scalar
         The sum of x1 and x2, element-wise. This is a scalar if both x1 and x2 are scalars.
+
+    Examples
+    --------
+    >>> np.add(1.0, 4.0)
+    5.0
+    >>>
+    >>> x1 = np.arange(9.0).reshape((3, 3))
+    >>> x2 = np.arange(3.0)
+    >>> np.add(x1, x2)
+    array([[ 0.,  2.,  4.],
+           [ 3.,  5.,  7.],
+           [ 6.,  8., 10.]])
     """
     return _mx_nd_np.add(x1, x2, out)
@@ -2358,6 +2430,17 @@ def subtract(x1, x2, out=None, **kwargs):
     -------
     subtract : ndarray or scalar
         The difference of x1 and x2, element-wise. This is a scalar if both x1 and x2 are scalars.
+
+    Examples
+    --------
+    >>> np.subtract(1.0, 4.0)
+    -3.0
+    >>> x1 = np.arange(9.0).reshape((3, 3))
+    >>> x2 = np.arange(3.0)
+    >>> np.subtract(x1, x2)
+    array([[0., 0., 0.],
+           [3., 3., 3.],
+           [6., 6., 6.]])
     """
     return _mx_nd_np.subtract(x1, x2, out)
@@ -2383,6 +2466,17 @@ def multiply(x1, x2, out=None, **kwargs):
     -------
     out : ndarray or scalar
         The product of x1 and x2, element-wise. This is a scalar if both x1 and x2 are scalars.
+
+    Examples
+    --------
+    >>> np.multiply(2.0, 4.0)
+    8.0
+    >>> x1 = np.arange(9.0).reshape((3, 3))
+    >>> x2 = np.arange(3.0)
+    >>> np.multiply(x1, x2)
+    array([[ 0.,  1.,  4.],
+           [ 0.,  4., 10.],
+           [ 0.,  7., 16.]])
     """
     return _mx_nd_np.multiply(x1, x2, out)
@@ -2410,6 +2504,12 @@ def divide(x1, x2, out=None, **kwargs):
     -------
     out : ndarray or scalar
         This is a scalar if both x1 and x2 are scalars.
+
+    Examples
+    --------
+    >>> x = np.arange(5)
+    >>> np.divide(x, 4)
+    array([0. , 0.25, 0.5 , 0.75, 1. ])
     """
     return _mx_nd_np.divide(x1, x2, out=out)
@@ -2439,6 +2538,12 @@ def true_divide(x1, x2, out=None):
     -------
     out : ndarray or scalar
         This is a scalar if both x1 and x2 are scalars.
+
+    Examples
+    --------
+    >>> x = np.arange(5)
+    >>> np.true_divide(x, 4)
+    array([0. , 0.25, 0.5 , 0.75, 1. 
]) """ return _mx_nd_np.true_divide(x1, x2, out=out) @@ -2466,6 +2571,11 @@ def mod(x1, x2, out=None, **kwargs): ------- out : ndarray or scalar This is a scalar if both x1 and x2 are scalars. + + Examples + -------- + >>> np.mod(np.arange(7), 5) + array([0., 1., 2., 3., 4., 0., 1.]) """ return _mx_nd_np.mod(x1, x2, out=out) @@ -2493,6 +2603,11 @@ def remainder(x1, x2, out=None, **kwargs): ------- out : ndarray or scalar This is a scalar if both x1 and x2 are scalars. + + Examples + -------- + >>> np.remainder(np.arange(7), 5) + array([0., 1., 2., 3., 4., 0., 1.]) """ return _mx_nd_np.remainder(x1, x2, out=out) @@ -2521,6 +2636,29 @@ def power(x1, x2, out=None, **kwargs): out : ndarray or scalar The bases in x1 raised to the exponents in x2. This is a scalar if both x1 and x2 are scalars. + + Examples + -------- + >>> x1 = np.arange(6) + >>> np.power(x1, 3) + array([ 0., 1., 8., 27., 64., 125.]) + + Raise the bases to different exponents. + + >>> x2 = np.array([1.0, 2.0, 3.0, 3.0, 2.0, 1.0]) + >>> np.power(x1, x2) + array([ 0., 1., 8., 27., 16., 5.]) + + The effect of broadcasting. + + >>> x2 = np.array([[1, 2, 3, 3, 2, 1], [1, 2, 3, 3, 2, 1]]) + >>> x2 + array([[1., 2., 3., 3., 2., 1.], + [1., 2., 3., 3., 2., 1.]]) + + >>> np.power(x1, x2) + array([[ 0., 1., 8., 27., 16., 5.], + [ 0., 1., 8., 27., 16., 5.]]) """ return _mx_nd_np.power(x1, x2, out=out) @@ -3610,7 +3748,7 @@ def negative(x, out=None, **kwargs): y : ndarray or scalar Returned array or scalar: y = -x. This is a scalar if x is a scalar. - Examples: + Examples -------- >>> np.negative(1) -1 @@ -3637,7 +3775,7 @@ def fix(x, out=None, **kwargs): y : ndarray or scalar Returned array or scalar: y = -x. This is a scalar if x is a scalar.ndarray of floats - Examples: + Examples --------- >>> np.fix(3.14) 3 @@ -3667,10 +3805,10 @@ def tan(x, out=None, **kwargs): y : ndarray The corresponding tangent values. This is a scalar if x is a scalar. - Examples: + Examples --------- - >>> np.tan(0.5) - 0.5463024898437905 + >>> np.tan(np.array([-np.pi, np.pi/2, np.pi])) + array([-8.7422777e-08, -2.2877332e+07, 8.7422777e-08]) """ return _mx_nd_np.tan(x, out=out, **kwargs) @@ -4044,7 +4182,7 @@ def histogram(a, bins=10, range=None, normed=None, weights=None, density=None): ---------- a : ndarray Input data. The histogram is computed over the flattened array. - bins : int or NDArray + bins : int or ndarray If `bins` is an int, it defines the number of equal-width bins in the given range (10, by default). If `bins` is a sequence, it defines a monotonically increasing array of bin edges, @@ -4062,6 +4200,11 @@ def histogram(a, bins=10, range=None, normed=None, weights=None, density=None): Not supported yet, coming soon. density : bool, optional Not supported yet, coming soon. + + Examples + -------- + >>> np.histogram(np.arange(4), bins=np.arange(5)) + [array([1, 1, 1, 1], dtype=int64), array([0., 1., 2., 3., 4.])] """ return _mx_nd_np.histogram(a, bins=bins, range=range, normed=normed, weights=weights, density=density) @@ -4089,6 +4232,16 @@ def eye(N, M=None, k=0, dtype=_np.float32, **kwargs): I : ndarray of shape (N,M) An array where all elements are equal to zero, except for the k-th diagonal, whose values are equal to one. + + Examples + -------- + >>> np.eye(2, dtype=int) + array([[1, 0], + [0, 1]], dtype=int64) + >>> np.eye(3, k=1) + array([[0., 1., 0.], + [0., 0., 1.], + [0., 0., 0.]]) """ return _mx_nd_np.eye(N, M, k, dtype, **kwargs) @@ -4274,6 +4427,37 @@ def expand_dims(a, axis): res : ndarray Output array. 
The number of dimensions is one greater than that of the input array. + + See Also + -------- + squeeze : The inverse operation, removing singleton dimensions + reshape : Insert, remove, and combine dimensions, and resize existing ones + + Examples + -------- + >>> x = np.array([1,2]) + >>> x.shape + (2,) + + >>> y = np.expand_dims(x, axis=0) + >>> y + array([[1., 2.]]) + + >>> y.shape + (1, 2) + + >>> y = np.expand_dims(x, axis=1) # Equivalent to x[:,np.newaxis] + >>> y + array([[1.], + [2.]]) + + >>> y.shape + (2, 1) + + Note that some examples may use None instead of np.newaxis. These are the same objects: + + >>> np.newaxis is None + True """ return _npi.expand_dims(a, axis) @@ -4417,6 +4601,20 @@ def arange(start, stop=None, step=1, dtype=None, ctx=None): ``ceil((stop - start)/step)``. Because of floating point overflow, this rule may result in the last element of `out` being greater than `stop`. + + Examples + -------- + >>> np.arange(3) + array([0., 1., 2.]) + + >>> np.arange(3.0) + array([0., 1., 2.]) + + >>> np.arange(3,7) + array([3., 4., 5., 6.]) + + >>> np.arange(3,7,2) + array([3., 5.]) """ return _mx_nd_np.arange(start, stop, step, dtype, ctx) @@ -4424,6 +4622,7 @@ def arange(start, stop=None, step=1, dtype=None, ctx=None): @set_module('mxnet.numpy') def split(ary, indices_or_sections, axis=0): """Split an array into multiple sub-arrays. + Parameters ---------- ary : ndarray @@ -4442,15 +4641,38 @@ def split(ary, indices_or_sections, axis=0): an empty sub-array is returned correspondingly. axis : int, optional The axis along which to split, default is 0. + Returns ------- sub-arrays : list of ndarrays A list of sub-arrays. + Raises ------ ValueError If `indices_or_sections` is given as an integer, but - a split does not result in equal division.""" + a split does not result in equal division. + + See Also + -------- + hsplit : Split array into multiple sub-arrays horizontally (column-wise). + vsplit : Split array into multiple sub-arrays vertically (row wise). + dsplit : Split array into multiple sub-arrays along the 3rd axis (depth). + concatenate : Join a sequence of arrays along an existing axis. + stack : Join a sequence of arrays along a new axis. + hstack : Stack arrays in sequence horizontally (column wise). + vstack : Stack arrays in sequence vertically (row wise). + dstack : Stack arrays in sequence depth wise (along third dimension). + + Examples + -------- + >>> x = np.arange(9.0) + >>> np.split(x, 3) + [array([0., 1., 2.]), array([3., 4., 5.]), array([6., 7., 8.])] + + >>> np.split(x, [3, 5, 6, 8]) + [array([0., 1., 2.]), array([3., 4.]), array([5.]), array([6., 7.]), array([])] + """ return _mx_nd_np.split(ary, indices_or_sections, axis=axis) @@ -4533,6 +4755,7 @@ def vsplit(ary, indices_or_sections): @set_module('mxnet.numpy') def concatenate(seq, axis=0, out=None): """Join a sequence of arrays along an existing axis. + Parameters ---------- a1, a2, ... : sequence of array_like @@ -4545,10 +4768,35 @@ def concatenate(seq, axis=0, out=None): If provided, the destination to place the result. The shape must be correct, matching that of what concatenate would have returned if no out argument were specified. + Returns ------- res : ndarray The concatenated array. + + See Also + -------- + split : Split array into a list of multiple sub-arrays of equal size. 
+ hsplit : Split array into multiple sub-arrays horizontally (column wise) + vsplit : Split array into multiple sub-arrays vertically (row wise) + dsplit : Split array into multiple sub-arrays along the 3rd axis (depth). + stack : Stack a sequence of arrays along a new axis. + hstack : Stack arrays in sequence horizontally (column wise) + vstack : Stack arrays in sequence vertically (row wise) + dstack : Stack arrays in sequence depth wise (along third dimension) + + Examples + -------- + >>> a = np.array([[1, 2], [3, 4]]) + >>> b = np.array([[5, 6]]) + >>> np.concatenate((a, b), axis=0) + array([[1., 2.], + [3., 4.], + [5., 6.]]) + + >>> np.concatenate((a, b.T), axis=1) + array([[1., 2., 5.], + [3., 4., 6.]]) """ return _mx_nd_np.concatenate(seq, axis=axis, out=out) @@ -4558,6 +4806,7 @@ def stack(arrays, axis=0, out=None): """Join a sequence of arrays along a new axis. The axis parameter specifies the index of the new axis in the dimensions of the result. For example, if `axis=0` it will be the first dimension and if `axis=-1` it will be the last dimension. + Parameters ---------- arrays : sequence of array_like @@ -4567,10 +4816,40 @@ def stack(arrays, axis=0, out=None): out : ndarray, optional If provided, the destination to place the result. The shape must be correct, matching that of what stack would have returned if no out argument were specified. + Returns ------- stacked : ndarray - The stacked array has one more dimension than the input arrays.""" + The stacked array has one more dimension than the input arrays. + + See Also + -------- + concatenate : Join a sequence of arrays along an existing axis. + split : Split array into a list of multiple sub-arrays of equal size. + + Examples + -------- + >>> arrays = [np.random.rand(3, 4) for _ in range(10)] + >>> np.stack(arrays, axis=0).shape + (10, 3, 4) + + >>> np.stack(arrays, axis=1).shape + (3, 10, 4) + + >>> np.stack(arrays, axis=2).shape + (3, 4, 10) + + >>> a = np.array([1, 2, 3]) + >>> b = np.array([2, 3, 4]) + >>> np.stack((a, b)) + array([[1., 2., 3.], + [2., 3., 4.]]) + + >>> np.stack((a, b), axis=-1) + array([[1., 2.], + [2., 3.], + [3., 4.]]) + """ return _mx_nd_np.stack(arrays, axis=axis, out=out) @@ -4678,7 +4957,17 @@ def maximum(x1, x2, out=None, **kwargs): Returns ------- out : mxnet.numpy.ndarray or scalar - The maximum of x1 and x2, element-wise. This is a scalar if both x1 and x2 are scalars.""" + The maximum of x1 and x2, element-wise. This is a scalar if both x1 and x2 are scalars. + + Examples + -------- + >>> np.maximum(np.array([2, 3, 4]), np.array([1, 5, 2])) + array([2., 5., 4.]) + + >>> np.maximum(np.eye(2), np.array([0.5, 2])) # broadcasting + array([[1. , 2. ], + [0.5, 2. ]]) + """ return _mx_nd_np.maximum(x1, x2, out=out) @@ -4697,7 +4986,17 @@ def minimum(x1, x2, out=None, **kwargs): Returns ------- out : mxnet.numpy.ndarray or scalar - The minimum of x1 and x2, element-wise. This is a scalar if both x1 and x2 are scalars.""" + The minimum of x1 and x2, element-wise. This is a scalar if both x1 and x2 are scalars. + + Examples + -------- + >>> np.minimum(np.array([2, 3, 4]), np.array([1, 5, 2])) + array([1., 3., 2.]) + + >>> np.minimum(np.eye(2), np.array([0.5, 2])) # broadcasting + array([[0.5, 0. ], + [0. , 1. ]]) + """ return _mx_nd_np.minimum(x1, x2, out=out) @@ -4718,6 +5017,29 @@ def swapaxes(a, axis1, axis2): ------- a_swapped : ndarray Swapped array. This is always a copy of the input array. 
+
+    Examples
+    --------
+    >>> x = np.array([[1,2,3]])
+    >>> np.swapaxes(x,0,1)
+    array([[1.],
+           [2.],
+           [3.]])
+
+    >>> x = np.array([[[0,1],[2,3]],[[4,5],[6,7]]])
+    >>> x
+    array([[[0., 1.],
+            [2., 3.]],
+
+           [[4., 5.],
+            [6., 7.]]])
+
+    >>> np.swapaxes(x,0,2)
+    array([[[0., 4.],
+            [2., 6.]],
+
+           [[1., 5.],
+            [3., 7.]]])
     """
     return _npi.swapaxes(a, dim1=axis1, dim2=axis2)
@@ -4776,8 +5098,6 @@ def clip(a, a_min, a_max, out=None):
 @set_module('mxnet.numpy')
 def argmax(a, axis=None, out=None):
     r"""
-    argmax(a, axis=None, out=None)
-
     Returns the indices of the maximum values along an axis.

     Parameters
@@ -4844,13 +5164,82 @@ def argmax(a, axis=None, out=None):
     return _mx_nd_np.argmax(a, axis, out)


+@set_module('mxnet.numpy')
+def argmin(a, axis=None, out=None):
+    r"""
+    Returns the indices of the minimum values along an axis.
+
+    Parameters
+    ----------
+    a : ndarray
+        Input array. Only support ndarrays of dtype `float16`, `float32`, and `float64`.
+    axis : int, optional
+        By default, the index is into the flattened array, otherwise
+        along the specified axis.
+    out : ndarray or None, optional
+        If provided, the result will be inserted into this array. It should
+        be of the appropriate shape and dtype.
+
+    Returns
+    -------
+    index_array : ndarray of indices whose dtype is same as the input ndarray.
+        Array of indices into the array. It has the same shape as `a.shape`
+        with the dimension along `axis` removed.
+
+    Notes
+    -----
+    In case of multiple occurrences of the minimum values, the indices
+    corresponding to the first occurrence are returned.
+
+    This function differs from the original `numpy.argmin
+    `_ in
+    the following aspects:
+
+    - Input type does not support Python native iterables(list, tuple, ...).
+    - Output has dtype that is same as the input ndarray.
+    - ``out`` param: cannot perform auto broadcasting. ``out`` ndarray's shape must be the same as the expected output.
+    - ``out`` param: cannot perform auto type cast. ``out`` ndarray's dtype must be the same as the expected output.
+    - ``out`` param does not support scalar input case.
+
+    Examples
+    --------
+    >>> a = np.arange(6).reshape(2,3) + 10
+    >>> a
+    array([[10., 11., 12.],
+           [13., 14., 15.]])
+    >>> np.argmin(a)
+    array(0.)
+    >>> np.argmin(a, axis=0)
+    array([0., 0., 0.])
+    >>> np.argmin(a, axis=1)
+    array([0., 0.])
+
+    >>> b = np.arange(6)
+    >>> b[2] = 0
+    >>> b
+    array([0., 1., 0., 3., 4., 5.])
+    >>> np.argmin(b)  # Only the first occurrence is returned.
+    array(0.)
+
+    Specify ``out`` ndarray:
+
+    >>> a = np.arange(6).reshape(2,3) + 10
+    >>> b = np.zeros((2,))
+    >>> np.argmin(a, axis=1, out=b)
+    array([0., 0.])
+    >>> b
+    array([0., 0.])
+    """
+    return _mx_nd_np.argmin(a, axis, out)
+
+
 @set_module('mxnet.numpy')
 def mean(a, axis=None, dtype=None, out=None, keepdims=False):  # pylint: disable=arguments-differ
     """
-    mean(a, axis=None, dtype=None, out=None, keepdims=None)
     Compute the arithmetic mean along the specified axis.
     Returns the average of the array elements.
     The average is taken over the flattened array by default, otherwise over the specified axis.
+
     Parameters
     ----------
     a : ndarray
         ndarray containing numbers whose mean is desired.
     axis : None or int or tuple of ints, optional
         Axis or axes along which the means are computed. The default is to compute the mean of the flattened array.
         If this is a tuple of ints, a mean is performed over multiple axes,
         instead of a single axis or all the axes as before.
     dtype : data-type, optional
         Type to use in computing the mean. For integer inputs, the default is float32;
         for floating point inputs, it is the same as the input dtype.
     out : ndarray, optional
         Alternate output array in which to place the result. The default is None; if provided,
         it must have the same shape and type as the expected output.
     keepdims : bool, optional
         If this is set to True, the axes which are reduced are left in the result
         as dimensions with size one. With this option, the result will broadcast correctly
         against the input array.
         If the default value is passed, then keepdims will not be passed through to the mean
         method of sub-classes of ndarray, however any non-default value will be.
         If the sub-class method does not implement keepdims any exceptions will be raised.
+ Returns ------- m : ndarray, see dtype parameter above If out=None, returns a new array containing the mean values, otherwise a reference to the output array is returned. + Notes ----- This function differs from the original `numpy.mean @@ -4884,6 +5275,7 @@ def mean(a, axis=None, dtype=None, out=None, keepdims=False): # pylint: disable the following way(s): - only ndarray is accepted as valid input, python iterables or scalar is not supported - default data type for integer input is float32 + Examples -------- >>> a = np.array([[1, 2], [3, 4]]) @@ -5758,16 +6150,19 @@ def inner(a, b): Examples -------- Ordinary inner product for vectors: + >>> a = np.array([1,2,3]) >>> b = np.array([0,1,0]) >>> np.inner(a, b) - 2 + array(2.) + A multidimensional example: + >>> a = np.arange(24).reshape((2,3,4)) >>> b = np.arange(4) >>> np.inner(a, b) - array([[ 14, 38, 62], - [ 86, 110, 134]]) + array([[ 14., 38., 62.], + [ 86., 110., 134.]]) """ return tensordot(a, b, [-1, -1]) @@ -5796,6 +6191,7 @@ def outer(a, b): ------- out : (M, N) ndarray ``out[i, j] = a[i] * b[j]`` + See also -------- inner @@ -5812,13 +6208,14 @@ def outer(a, b): Examples -------- Make a (*very* coarse) grid for computing a Mandelbrot set: + >>> rl = np.outer(np.ones((5,)), np.linspace(-2, 2, 5)) >>> rl array([[-2., -1., 0., 1., 2.], - [-2., -1., 0., 1., 2.], - [-2., -1., 0., 1., 2.], - [-2., -1., 0., 1., 2.], - [-2., -1., 0., 1., 2.]]) + [-2., -1., 0., 1., 2.], + [-2., -1., 0., 1., 2.], + [-2., -1., 0., 1., 2.], + [-2., -1., 0., 1., 2.]]) """ return tensordot(a.flatten(), b.flatten(), 0) @@ -5851,12 +6248,13 @@ def vdot(a, b): Examples -------- Note that higher-dimensional arrays are flattened! + >>> a = np.array([[1, 4], [5, 6]]) >>> b = np.array([[4, 1], [2, 2]]) >>> np.vdot(a, b) - 30 + array(30.) >>> np.vdot(b, a) - 30 + array(30.) >>> 1*4 + 4*1 + 5*2 + 6*2 30 """ @@ -6060,6 +6458,7 @@ def rot90(m, k=1, axes=(0, 1)): """ Rotate an array by 90 degrees in the plane specified by axes. Rotation direction is from the first towards the second axis. + Parameters ---------- m : ndarray @@ -6075,9 +6474,11 @@ def rot90(m, k=1, axes=(0, 1)): y : ndarray A rotated view of `m`. + Notes ----- rot90(m, k=1, axes=(1,0)) is the reverse of rot90(m, k=1, axes=(0,1)) rot90(m, k=1, axes=(1,0)) is equivalent to rot90(m, k=-1, axes=(0,1)) + Examples -------- >>> m = np.array([[1,2],[3,4]], 'int') @@ -6419,3 +6820,83 @@ def einsum(*operands, **kwargs): ... np.einsum('ijk,ilm,njm,nlk,abc->',a,a,a,a,a, optimize=True) """ return _mx_nd_np.einsum(*operands, **kwargs) + + +@set_module('mxnet.numpy') +def nonzero(a): + """ + Return the indices of the elements that are non-zero. + + Returns a tuple of arrays, one for each dimension of `a`, + containing the indices of the non-zero elements in that + dimension. The values in `a` are always returned in + row-major, C-style order. + + To group the indices by element, rather than dimension, use `argwhere`, + which returns a row for each non-zero element. + + Parameters + ---------- + a : ndarray + Input array. + + Returns + ------- + tuple_of_arrays : tuple + Indices of elements that are non-zero. + + See Also + -------- + ndarray.nonzero : + Equivalent ndarray method. + + Notes + ----- + While the nonzero values can be obtained with ``a[nonzero(a)]``, it is + recommended to use ``x[x.astype(bool)]`` or ``x[x != 0]`` instead, which + will correctly handle 0-d arrays. 
+ + Examples + -------- + >>> x = np.array([[3, 0, 0], [0, 4, 0], [5, 6, 0]]) + >>> x + array([[3, 0, 0], + [0, 4, 0], + [5, 6, 0]], dtype=int32) + >>> np.nonzero(x) + (array([0, 1, 2, 2], dtype=int64), array([0, 1, 0, 1], dtype=int64)) + + >>> x[np.nonzero(x)] + array([3, 4, 5, 6]) + >>> np.transpose(np.stack(np.nonzero(x))) + array([[0, 0], + [1, 1], + [2, 0], + [2, 1]], dtype=int64) + + A common use for ``nonzero`` is to find the indices of an array, where + a condition is True. Given an array `a`, the condition `a` > 3 is a + boolean array and since False is interpreted as 0, np.nonzero(a > 3) + yields the indices of the `a` where the condition is true. + + >>> a = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=np.int32) + >>> a > 3 + array([[False, False, False], + [ True, True, True], + [ True, True, True]]) + >>> np.nonzero(a > 3) + (array([1, 1, 1, 2, 2, 2], dtype=int64), array([0, 1, 2, 0, 1, 2], dtype=int64)) + + Using this result to index `a` is equivalent to using the mask directly: + + >>> a[np.nonzero(a > 3)] + array([4, 5, 6, 7, 8, 9], dtype=int32) + >>> a[a > 3] + array([4, 5, 6, 7, 8, 9], dtype=int32) + + ``nonzero`` can also be called as a method of the array. + + >>> (a > 3).nonzero() + (array([1, 1, 1, 2, 2, 2], dtype=int64), array([0, 1, 2, 0, 1, 2], dtype=int64)) + """ + return _mx_nd_np.nonzero(a) diff --git a/python/mxnet/numpy/random.py b/python/mxnet/numpy/random.py index d0ae237a5b92..1cad4a55c466 100644 --- a/python/mxnet/numpy/random.py +++ b/python/mxnet/numpy/random.py @@ -20,11 +20,11 @@ from __future__ import absolute_import from ..ndarray import numpy as _mx_nd_np -__all__ = ["randint", "uniform", "normal", "choice", "rand"] +__all__ = ["randint", "uniform", "normal", "choice", "rand", "multinomial"] def randint(low, high=None, size=None, dtype=None, ctx=None, out=None): - """Return random integers from `low` (inclusive) to `high` (exclusive). + r"""Return random integers from `low` (inclusive) to `high` (exclusive). Return random integers from the "discrete uniform" distribution of the specified dtype in the "half-open" interval [`low`, `high`). If @@ -76,7 +76,7 @@ def randint(low, high=None, size=None, dtype=None, ctx=None, out=None): def uniform(low=0.0, high=1.0, size=None, dtype=None, ctx=None, out=None): - """Draw samples from a uniform distribution. + r"""Draw samples from a uniform distribution. Samples are uniformly distributed over the half-open interval ``[low, high)`` (includes low, but excludes high). In other words, @@ -95,7 +95,8 @@ def uniform(low=0.0, high=1.0, size=None, dtype=None, ctx=None, out=None): Output shape. If the given shape is, e.g., ``(m, n, k)``, then ``m * n * k`` samples are drawn. If size is ``None`` (default), a scalar tensor containing a single value is returned if - ``low`` and ``high`` are both scalars. + ``low`` and ``high`` are both scalars. Otherwise, + ``np.broadcast(low, high).size`` samples are drawn. dtype : {'float16', 'float32', 'float64'}, optional Data type of output samples. Default is 'float32' ctx : Context, optional @@ -105,12 +106,33 @@ def uniform(low=0.0, high=1.0, size=None, dtype=None, ctx=None, out=None): ------- out : ndarray Drawn samples from the parameterized uniform distribution. + + See Also + -------- + randint : Discrete uniform distribution, yielding integers. + rand : Convenience function that accepts dimensions as input, e.g., + ``rand(2,2)`` would generate a 2-by-2 array of floats, + uniformly distributed over ``[0, 1)``. 
+
+    Notes
+    -----
+    The probability density function of the uniform distribution is
+
+    .. math:: p(x) = \frac{1}{b - a}
+
+    anywhere within the interval ``[a, b)``, and zero elsewhere.
+
+    When ``high`` == ``low``, values of ``low`` will be returned.
+    If ``high`` < ``low``, the results are officially undefined
+    and may eventually raise an error, i.e. do not rely on this
+    function to behave when passed arguments satisfying that
+    inequality condition.
    """
    return _mx_nd_np.random.uniform(low, high, size=size, ctx=ctx, dtype=dtype, out=out)


 def normal(loc=0.0, scale=1.0, size=None, dtype=None, ctx=None, out=None):
-    """Draw random samples from a normal (Gaussian) distribution.
+    r"""Draw random samples from a normal (Gaussian) distribution.

     Samples are distributed according to a normal distribution parametrized
     by *loc* (mean) and *scale* (standard deviation).
@@ -125,7 +147,8 @@ def normal(loc=0.0, scale=1.0, size=None, dtype=None, ctx=None, out=None):
     size : int or tuple of ints, optional
         Output shape. If the given shape is, e.g., `(m, n, k)`, then `m * n * k`
         samples are drawn. If size is `None` (default), a scalar tensor containing
-        a single value is returned if loc and scale are both scalars.
+        a single value is returned if loc and scale are both scalars. Otherwise,
+        ``np.broadcast(loc, scale).size`` samples are drawn.
     dtype : {'float16', 'float32', 'float64'}, optional
         Data type of output samples. Default is 'float32'
     ctx : Context, optional
@@ -137,17 +160,53 @@ def normal(loc=0.0, scale=1.0, size=None, dtype=None, ctx=None, out=None):
     -------
     out : ndarray
         Drawn samples from the parameterized normal distribution.
+
+    Notes
+    -----
+    The probability density for the Gaussian distribution is
+
+    .. math:: p(x) = \frac{1}{\sqrt{ 2 \pi \sigma^2 }}
+                     e^{ - \frac{ (x - \mu)^2 } {2 \sigma^2} },
+
+    where :math:`\mu` is the mean and :math:`\sigma` the standard
+    deviation. The square of the standard deviation, :math:`\sigma^2`,
+    is called the variance.
+
+    The function has its peak at the mean, and its "spread" increases with
+    the standard deviation (the function reaches 0.607 times its maximum at
+    :math:`x + \sigma` and :math:`x - \sigma` [2]_). This implies that
+    `numpy.random.normal` is more likely to return samples lying close to
+    the mean, rather than those far away.
+
+    References
+    ----------
+    .. [1] Wikipedia, "Normal distribution",
+           https://en.wikipedia.org/wiki/Normal_distribution
+    .. [2] P. R. Peebles Jr., "Central Limit Theorem" in "Probability,
+           Random Variables and Random Signal Principles", 4th ed., 2001,
+           pp. 51, 51, 125.
+
+    Examples
+    --------
+    >>> mu, sigma = 0, 0.1 # mean and standard deviation
+    >>> s = np.random.normal(mu, sigma, 1000)
+
+    Verify the mean and the variance:
+
+    >>> np.abs(mu - np.mean(s)) < 0.01
+    array(True)
     """
     return _mx_nd_np.random.normal(loc, scale, size, dtype, ctx, out)


 def multinomial(n, pvals, size=None, **kwargs):
-    """multinomial(n, pvals, size=None)
+    r"""
     Draw samples from a multinomial distribution.

     The multinomial distribution is a multivariate generalisation of the binomial distribution.
     Take an experiment with one of ``p`` possible outcomes. An example of such an experiment is throwing a dice,
     where the outcome can be 1 through 6. Each sample drawn from the distribution represents n such experiments.
     Its values, ``X_i = [X_0, X_1, ..., X_p]``, represent the number of times the outcome was ``i``.
+ Parameters ---------- n : int @@ -157,18 +216,23 @@ def multinomial(n, pvals, size=None, **kwargs): size : int or tuple of ints, optional Output shape. If the given shape is, e.g., ``(m, n, k)``, then ``m * n * k`` samples are drawn. Default is None, in which case a single value is returned. + Returns ------- out : ndarray The drawn samples, of shape size, if that was provided. If not, the shape is ``(N,)``. In other words, each entry ``out[i,j,...,:]`` is an N-dimensional value drawn from the distribution. + Examples -------- Throw a dice 1000 times, and 1000 times again: + >>> np.random.multinomial(1000, [1/6.]*6, size=2) array([[164, 161, 179, 158, 150, 188], [178, 162, 177, 143, 163, 177]]) + A loaded die is more likely to land on number 6: + >>> np.random.multinomial(100, [1/7.]*5 + [2/7.]) array([19, 14, 12, 11, 21, 23]) >>> np.random.multinomial(100, [1.0 / 3, 2.0 / 3]) @@ -178,7 +242,7 @@ def multinomial(n, pvals, size=None, **kwargs): def choice(a, size=None, replace=True, p=None, ctx=None, out=None): - """Generates a random sample from a given 1-D array + r"""Generates a random sample from a given 1-D array Parameters ----------- diff --git a/python/mxnet/numpy/stride_tricks.py b/python/mxnet/numpy/stride_tricks.py index 0b2fe523b0f3..b4a4d0a7b44a 100644 --- a/python/mxnet/numpy/stride_tricks.py +++ b/python/mxnet/numpy/stride_tricks.py @@ -46,6 +46,15 @@ def broadcast_arrays(*args): These arrays are copies of the original arrays unless that all the input arrays have the same shape, the input list of arrays are returned instead of a list of copies. + + Examples + -------- + >>> x = np.array([[1,2,3]]) + >>> y = np.array([[4],[5]]) + >>> np.broadcast_arrays(x, y) + [array([[1., 2., 3.], + [1., 2., 3.]]), array([[4., 4., 4.], + [5., 5., 5.]])] """ shape = _broadcast_shape(*args) diff --git a/python/mxnet/numpy/utils.py b/python/mxnet/numpy/utils.py index b2335e29855d..c34650a61f31 100644 --- a/python/mxnet/numpy/utils.py +++ b/python/mxnet/numpy/utils.py @@ -23,7 +23,7 @@ import numpy as onp __all__ = ['float16', 'float32', 'float64', 'uint8', 'int32', 'int8', 'int64', - 'bool', 'bool_', 'pi', 'inf', 'nan', 'PZERO', 'NZERO'] + 'bool', 'bool_', 'pi', 'inf', 'nan', 'PZERO', 'NZERO', 'newaxis'] float16 = onp.float16 float32 = onp.float32 @@ -40,3 +40,5 @@ nan = onp.nan PZERO = onp.PZERO NZERO = onp.NZERO + +newaxis = None diff --git a/python/mxnet/numpy_dispatch_protocol.py b/python/mxnet/numpy_dispatch_protocol.py index 6db44fad7780..cec2f245a5e1 100644 --- a/python/mxnet/numpy_dispatch_protocol.py +++ b/python/mxnet/numpy_dispatch_protocol.py @@ -83,6 +83,7 @@ def _run_with_array_ufunc_proto(*args, **kwargs): _NUMPY_ARRAY_FUNCTION_LIST = [ + 'argmin', 'argmax', 'around', 'broadcast_arrays', @@ -99,6 +100,7 @@ def _run_with_array_ufunc_proto(*args, **kwargs): 'max', 'mean', 'min', + 'nonzero', 'ones_like', 'prod', 'ravel', diff --git a/python/mxnet/numpy_extension/random.py b/python/mxnet/numpy_extension/random.py index 5aa58a0cc69d..9c059ca9ade4 100644 --- a/python/mxnet/numpy_extension/random.py +++ b/python/mxnet/numpy_extension/random.py @@ -25,7 +25,7 @@ def seed(seed, ctx='all'): # pylint: disable=redefined-outer-name - """Seeds the random number generators in MXNet. + r"""Seeds the random number generators in MXNet. This affects the behavior of modules in MXNet that uses random number generators, like the dropout operator and `ndarray`'s random sampling operators. 
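A minimal usage sketch of the numpy-compatible random front end touched by the hunks above, assuming a build that includes this patch (`np` and `npx` are the conventional aliases for `mxnet.numpy` and `mxnet.numpy_extension`; the drawn values themselves are omitted since they depend on the generator):

>>> from mxnet import np, npx
>>> npx.random.seed(42)          # seed all of MXNet's random number generators
>>> s = np.random.multinomial(100, [1.0 / 3, 2.0 / 3])   # multinomial is now exported via np.random
>>> s.shape                      # one count per outcome in `pvals`
(2,)
>>> c = np.random.choice(np.arange(5), size=(3,))
>>> c.shape
(3,)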
diff --git a/python/mxnet/symbol/numpy/_symbol.py b/python/mxnet/symbol/numpy/_symbol.py
index aa456c8e5166..ddf2feb30b18 100644
--- a/python/mxnet/symbol/numpy/_symbol.py
+++ b/python/mxnet/symbol/numpy/_symbol.py
@@ -36,7 +36,7 @@
     'rint', 'radians', 'reciprocal', 'square', 'negative', 'fix', 'ceil', 'floor',
     'trunc', 'logical_not', 'arcsinh', 'arccosh', 'arctanh', 'tensordot', 'histogram', 'eye',
     'linspace', 'logspace', 'expand_dims', 'tile', 'arange', 'split', 'vsplit', 'concatenate',
-    'stack', 'vstack', 'dstack', 'mean', 'maximum', 'minimum', 'swapaxes', 'clip', 'argmax',
+    'stack', 'vstack', 'dstack', 'mean', 'maximum', 'minimum', 'swapaxes', 'clip', 'argmax', 'argmin',
     'std', 'var', 'indices', 'copysign', 'ravel', 'hanning', 'hamming', 'blackman', 'flip',
     'around', 'hypot', 'rad2deg', 'deg2rad', 'unique', 'lcm', 'tril', 'identity', 'take',
     'ldexp', 'vdot', 'inner', 'outer', 'equal', 'not_equal', 'greater', 'less', 'greater_equal',
@@ -385,13 +385,10 @@ def argmax_channel(self, *args, **kwargs):
         """
         raise AttributeError('_Symbol object has no attribute argmax_channel')

-    def argmin(self, *args, **kwargs):
-        """Convenience fluent method for :py:func:`argmin`.
-
-        The arguments are the same as for :py:func:`argmin`, with
-        this array as data.
-        """
-        raise NotImplementedError
+    def argmin(self, axis=None, out=None):  # pylint: disable=arguments-differ
+        """Return indices of the minimum values along the given axis.
+        Refer to `mxnet.numpy.argmin` for full documentation."""
+        return argmin(self, axis, out)

     def clip(self, min=None, max=None, out=None):  # pylint: disable=arguments-differ
         """Return an array whose values are limited to [min, max].
@@ -3187,8 +3184,6 @@ def swapaxes(a, axis1, axis2):
 @set_module('mxnet.symbol.numpy')
 def argmax(a, axis=None, out=None):
     r"""
-    argmax(a, axis=None, out=None)
-
     Returns the indices of the maximum values along an axis.

     Parameters
@@ -3226,6 +3221,46 @@ def argmax(a, axis=None, out=None):
     return _npi.argmax(a, axis=axis, keepdims=False, out=out)


+@set_module('mxnet.symbol.numpy')
+def argmin(a, axis=None, out=None):
+    r"""
+    Returns the indices of the minimum values along an axis.
+
+    Parameters
+    ----------
+    a : _Symbol
+        Input array. Only support dtype `float16`, `float32`, and `float64`.
+    axis : int, optional
+        By default, the index is into the flattened array, otherwise
+        along the specified axis.
+    out : _Symbol or None, optional
+        Dummy parameter to keep the consistency with the ndarray counterpart.
+
+    Returns
+    -------
+    index_array : _Symbol of indices whose dtype is same as the input ndarray.
+        Array of indices into the array. It has the same shape as `a.shape`
+        with the dimension along `axis` removed.
+
+    Notes
+    -----
+    In case of multiple occurrences of the minimum values, the indices
+    corresponding to the first occurrence are returned.
+
+    This function differs from the original `numpy.argmin
+    `_ in
+    the following aspects:
+
+    - Input type does not support Python native iterables(list, tuple, ...).
+    - Output has dtype that is same as the input ndarray.
+    - ``out`` param: cannot perform auto broadcasting. ``out`` symbol's shape must be the same as the expected output.
+    - ``out`` param: cannot perform auto type cast. ``out`` symbol's dtype must be the same as the expected output.
+    - ``out`` param does not support scalar input case.
+ + """ + return _npi.argmin(a, axis=axis, keepdims=False, out=out) + + @set_module('mxnet.symbol.numpy') def mean(a, axis=None, dtype=None, out=None, keepdims=False): # pylint: disable=arguments-differ """ diff --git a/python/mxnet/symbol/numpy/random.py b/python/mxnet/symbol/numpy/random.py index d891ea0c21a0..48bccb64a2b4 100644 --- a/python/mxnet/symbol/numpy/random.py +++ b/python/mxnet/symbol/numpy/random.py @@ -25,7 +25,7 @@ def randint(low, high=None, size=None, dtype=None, ctx=None, out=None): - """Return random integers from `low` (inclusive) to `high` (exclusive). + r"""Return random integers from `low` (inclusive) to `high` (exclusive). Return random integers from the "discrete uniform" distribution of the specified dtype in the "half-open" interval [`low`, `high`). If @@ -113,7 +113,7 @@ def rand(*size, **kwargs): def uniform(low=0.0, high=1.0, size=None, dtype=None, ctx=None, out=None): - """Draw samples from a uniform distribution. + r"""Draw samples from a uniform distribution. Samples are uniformly distributed over the half-open interval ``[low, high)`` (includes low, but excludes high). In other words, @@ -168,7 +168,7 @@ def uniform(low=0.0, high=1.0, size=None, dtype=None, ctx=None, out=None): def normal(loc=0.0, scale=1.0, size=None, dtype=None, ctx=None, out=None): - """Draw random samples from a normal (Gaussian) distribution. + r"""Draw random samples from a normal (Gaussian) distribution. Samples are distributed according to a normal distribution parametrized by *loc* (mean) and *scale* (standard deviation). @@ -217,7 +217,7 @@ def normal(loc=0.0, scale=1.0, size=None, dtype=None, ctx=None, out=None): def choice(a, size=None, replace=True, p=None, ctx=None, out=None): - """Generates a random sample from a given 1-D array + r"""Generates a random sample from a given 1-D array Parameters ----------- diff --git a/python/mxnet/util.py b/python/mxnet/util.py index 9e15caae9698..3a85e31e7e43 100644 --- a/python/mxnet/util.py +++ b/python/mxnet/util.py @@ -690,15 +690,76 @@ def _set_np_array(active): def set_np(shape=True, array=True): """Setting NumPy shape and array semantics at the same time. - It is required to keep NumPy shape semantics active when activating NumPy array semantics. + It is required to keep NumPy shape semantics active while activating NumPy array semantics. Deactivating NumPy shape semantics while NumPy array semantics is still active is not allowed. + It is highly recommended to set these two flags to `True` at the same time to fully enable + NumPy-like behaviors. Please refer to the Examples section for a better understanding. Parameters ---------- shape : bool A boolean value indicating whether the NumPy-shape semantics should be turned on or off. + When this flag is set to `True`, zero-size and zero-dim shapes are all valid shapes in + shape inference process, instead of treated as unknown shapes in legacy mode. array : bool A boolean value indicating whether the NumPy-array semantics should be turned on or off. + When this flag is set to `True`, it enables Gluon code flow to use or generate `mxnet.numpy.ndarray`s + instead of `mxnet.ndarray.NDArray`. For example, a `Block` would create parameters of type + `mxnet.numpy.ndarray`. + + Examples + -------- + >>> import mxnet as mx + + Creating zero-dim ndarray in legacy mode would fail at shape inference. + + >>> mx.nd.ones(shape=()) + mxnet.base.MXNetError: Operator _ones inferring shapes failed. + + >>> mx.nd.ones(shape=(2, 0, 3)) + mxnet.base.MXNetError: Operator _ones inferring shapes failed. 
+ + In legacy mode, Gluon layers would create parameters and outputs of type `mx.nd.NDArray`. + + >>> from mxnet.gluon import nn + >>> dense = nn.Dense(2) + >>> dense.initialize() + >>> dense(mx.nd.ones(shape=(3, 2))) + [[0.01983214 0.07832371] + [0.01983214 0.07832371] + [0.01983214 0.07832371]] + + + >>> [p.data() for p in dense.collect_params().values()] + [ + [[0.0068339 0.01299825] + [0.0301265 0.04819721]] + , + [0. 0.] + ] + + When the `shape` flag is `True`, both shape inferences are successful. + + >>> from mxnet import np, npx + >>> npx.set_np() # this is required to activate NumPy-like behaviors + + >>> np.ones(shape=()) + array(1.) + >>> np.ones(shape=(2, 0, 3)) + array([], shape=(2, 0, 3)) + + When the `array` flag is `True`, Gluon layers would create parameters and outputs of type `mx.np.ndarray`. + + >>> dense = nn.Dense(2) + >>> dense.initialize() + >>> dense(np.ones(shape=(3, 2))) + array([[0.01983214, 0.07832371], + [0.01983214, 0.07832371], + [0.01983214, 0.07832371]]) + + >>> [p.data() for p in dense.collect_params().values()] + [array([[0.0068339 , 0.01299825], + [0.0301265 , 0.04819721]]), array([0., 0.])] """ if not shape and array: raise ValueError('NumPy Shape semantics is required in using NumPy array semantics.') diff --git a/src/operator/contrib/boolean_mask.cc b/src/operator/contrib/boolean_mask.cc index a54cc917776d..cd2fd8e42f8e 100644 --- a/src/operator/contrib/boolean_mask.cc +++ b/src/operator/contrib/boolean_mask.cc @@ -129,7 +129,7 @@ inline void BooleanMaskForward(const nnvm::NodeAttrs& attrs, const_cast(out).Init(s); // do the copy - MSHADOW_TYPE_SWITCH(data.dtype(), DType, { + MSHADOW_TYPE_SWITCH_WITH_BOOL(data.dtype(), DType, { size_t input_size = data.shape().Size(); size_t col_size = input_size / idx_size; mshadow::Stream *stream = ctx.get_stream(); diff --git a/src/operator/contrib/boolean_mask.cu b/src/operator/contrib/boolean_mask.cu index 71d91c63f64e..f6c1df0c62a8 100644 --- a/src/operator/contrib/boolean_mask.cu +++ b/src/operator/contrib/boolean_mask.cu @@ -86,7 +86,7 @@ inline void BooleanMaskForward(const nnvm::NodeAttrs& attrs, size_t input_size = data.shape().Size(); size_t col_size = input_size / idx.shape()[0]; // Do the copy - MSHADOW_TYPE_SWITCH(out.dtype(), DType, { + MSHADOW_TYPE_SWITCH_WITH_BOOL(out.dtype(), DType, { if (valid_num > 0) { mxnet_op::Kernel::Launch( s, input_size, out.data().dptr(), data.data().dptr(), prefix_sum, col_size); diff --git a/src/operator/numpy/np_broadcast_reduce_op_index.cc b/src/operator/numpy/np_broadcast_reduce_op_index.cc index bd6915cc9b27..15831c7e79ba 100644 --- a/src/operator/numpy/np_broadcast_reduce_op_index.cc +++ b/src/operator/numpy/np_broadcast_reduce_op_index.cc @@ -57,5 +57,16 @@ NNVM_REGISTER_OP(_npi_argmax) .set_attr("FGradient", MakeZeroGradNodes) .add_arguments(ReduceAxisParam::__FIELDS__()); +NNVM_REGISTER_OP(_npi_argmin) +.set_num_inputs(1) +.set_num_outputs(1) +.set_attr_parser(ParamParser) +.set_attr("FInferShape", NumpyReduceAxisShape) +.set_attr("FInferType", ElemwiseType<1, 1>) +.add_argument("data", "NDArray-or-Symbol", "The input") +.set_attr("FCompute", SearchAxisCompute) +.set_attr("FGradient", MakeZeroGradNodes) +.add_arguments(ReduceAxisParam::__FIELDS__()); + } // namespace op } // namespace mxnet diff --git a/src/operator/numpy/np_broadcast_reduce_op_index.cu b/src/operator/numpy/np_broadcast_reduce_op_index.cu index a07baa9c070c..0420133ee7c0 100644 --- a/src/operator/numpy/np_broadcast_reduce_op_index.cu +++ b/src/operator/numpy/np_broadcast_reduce_op_index.cu @@ 
-30,5 +30,8 @@ namespace op { NNVM_REGISTER_OP(_npi_argmax) .set_attr("FCompute", SearchAxisCompute); +NNVM_REGISTER_OP(_npi_argmin) +.set_attr("FCompute", SearchAxisCompute); + } // namespace op } // namespace mxnet diff --git a/src/operator/numpy/np_nonzero_op.cc b/src/operator/numpy/np_nonzero_op.cc index 00f9081ba984..0eaf0878a24a 100644 --- a/src/operator/numpy/np_nonzero_op.cc +++ b/src/operator/numpy/np_nonzero_op.cc @@ -91,7 +91,7 @@ void NonzeroForwardCPU(const nnvm::NodeAttrs& attrs, std::vector prefix_sum(in_size, 0); size_t valid_num = 0; // Calculate prefix sum - MSHADOW_TYPE_SWITCH(in.dtype(), DType, { + MSHADOW_TYPE_SWITCH_WITH_BOOL(in.dtype(), DType, { DType* in_dptr = in.data().dptr(); for (size_t i = 0; i < in_size; i++) { prefix_sum[i] = (i == 0) ? 0 : prefix_sum[i - 1]; @@ -113,6 +113,7 @@ void NonzeroForwardCPU(const nnvm::NodeAttrs& attrs, } NNVM_REGISTER_OP(_npx_nonzero) +.add_alias("_npi_nonzero") .set_num_inputs(1) .set_num_outputs(1) .set_attr("FListInputNames", diff --git a/src/operator/numpy/np_nonzero_op.cu b/src/operator/numpy/np_nonzero_op.cu index 33925ea2e156..c732d2c78493 100644 --- a/src/operator/numpy/np_nonzero_op.cu +++ b/src/operator/numpy/np_nonzero_op.cu @@ -80,7 +80,7 @@ void NonzeroForwardGPU(const nnvm::NodeAttrs& attrs, ctx.requested[0].get_space_typed(Shape1(temp_storage_bytes), stream); prefix_sum = reinterpret_cast(workspace.dptr_); d_temp_storage = workspace.dptr_ + buffer_size; - MSHADOW_TYPE_SWITCH(in.dtype(), DType, { + MSHADOW_TYPE_SWITCH_WITH_BOOL(in.dtype(), DType, { mxnet_op::Kernel::Launch( stream, in_size, prefix_sum, in.data().dptr()); }); diff --git a/tests/python/unittest/test_numpy_gluon.py b/tests/python/unittest/test_numpy_gluon.py index af5425336699..12e89a2d9b39 100644 --- a/tests/python/unittest/test_numpy_gluon.py +++ b/tests/python/unittest/test_numpy_gluon.py @@ -156,6 +156,29 @@ def test_np_loss_ndarray(): assert_almost_equal(L, _np.array([1.06346405, 0.04858733]), use_broadcast=False) +@with_seed() +@use_np +def test_np_get_constant(): + const_arr = _np.random.uniform(0, 100, size=(10, 10)).astype(_np.float32) + + class Foo(gluon.HybridBlock): + def __init__(self, prefix=None, params=None): + super(Foo, self).__init__(prefix=prefix, params=params) + self.weight = self.params.get_constant('const', const_arr) + + def hybrid_forward(self, F, x, weight): + return x + weight.astype(np.float32) + + x = np.random.uniform(size=const_arr.shape, dtype=const_arr.dtype) + for hybridize in [False, True]: + foo = Foo() + if hybridize: + foo.hybridize() + foo.initialize() + out = foo(x) + assert_almost_equal(out.asnumpy(), (x.asnumpy() + const_arr), atol=1e-5, rtol=1e-4, use_broadcast=False) + + if __name__ == '__main__': import nose nose.runmodule() diff --git a/tests/python/unittest/test_numpy_interoperability.py b/tests/python/unittest/test_numpy_interoperability.py index 62004ac6d263..860fecc5cda0 100644 --- a/tests/python/unittest/test_numpy_interoperability.py +++ b/tests/python/unittest/test_numpy_interoperability.py @@ -383,6 +383,22 @@ def _add_workload_argmax(): OpArgMngr.add_workload('argmax', np.array([True, False, True, False, False])) +def _add_workload_argmin(): + OpArgMngr.add_workload('argmin', np.random.uniform(size=(4, 5, 6, 7, 8)), 0) + OpArgMngr.add_workload('argmin', np.random.uniform(size=(4, 5, 6, 7, 8)), 1) + OpArgMngr.add_workload('argmin', np.random.uniform(size=(4, 5, 6, 7, 8)), 2) + OpArgMngr.add_workload('argmin', np.random.uniform(size=(4, 5, 6, 7, 8)), 3) + OpArgMngr.add_workload('argmin', 
np.random.uniform(size=(4, 5, 6, 7, 8)), 4) + # OpArgMngr.add_workload('argmin', np.array([0, 1, 2, 3, np.nan])) + # OpArgMngr.add_workload('argmin', np.array([0, 1, 2, np.nan, 3])) + # OpArgMngr.add_workload('argmin', np.array([np.nan, 0, 1, 2, 3])) + # OpArgMngr.add_workload('argmin', np.array([np.nan, 0, np.nan, 2, 3])) + OpArgMngr.add_workload('argmin', np.array([False, False, False, False, True])) + OpArgMngr.add_workload('argmin', np.array([False, False, False, True, False])) + OpArgMngr.add_workload('argmin', np.array([True, False, False, False, False])) + OpArgMngr.add_workload('argmin', np.array([True, False, True, False, False])) + + def _add_workload_around(): OpArgMngr.add_workload('around', np.array([1.56, 72.54, 6.35, 3.25]), decimals=1) @@ -1059,6 +1075,16 @@ def _add_workload_less_equal(array_pool): # OpArgMngr.add_workload('less_equal', np.array([np.nan]), np.array([np.nan])) +def _add_workload_nonzero(): + OpArgMngr.add_workload('nonzero', np.random.randint(0, 2)) + OpArgMngr.add_workload('nonzero', np.random.randint(0, 2, size=())) + OpArgMngr.add_workload('nonzero', np.random.randint(0, 2, size=(0, 1, 2))) + OpArgMngr.add_workload('nonzero', np.random.randint(0, 2, size=(0, 1, 0))) + OpArgMngr.add_workload('nonzero', np.random.randint(0, 2, size=(2, 3, 4))) + OpArgMngr.add_workload('nonzero', np.array([False, False, False], dtype=np.bool_)) + OpArgMngr.add_workload('nonzero', np.array([True, False, False], dtype=np.bool_)) + + @use_np def _prepare_workloads(): array_pool = { @@ -1067,6 +1093,7 @@ def _prepare_workloads(): '1x1x0': np.array([[[]]]) } + _add_workload_argmin() _add_workload_argmax() _add_workload_around() _add_workload_broadcast_arrays(array_pool) @@ -1083,6 +1110,7 @@ def _prepare_workloads(): _add_workload_max(array_pool) _add_workload_min(array_pool) _add_workload_mean(array_pool) + _add_workload_nonzero() _add_workload_ones_like(array_pool) _add_workload_prod(array_pool) _add_workload_repeat(array_pool) diff --git a/tests/python/unittest/test_numpy_op.py b/tests/python/unittest/test_numpy_op.py index 01778099412a..c4f756d56a33 100644 --- a/tests/python/unittest/test_numpy_op.py +++ b/tests/python/unittest/test_numpy_op.py @@ -2173,7 +2173,7 @@ def hybrid_forward(self, F, x): @with_seed() @use_np -def test_np_argmax(): +def test_np_argmin_argmax(): workloads = [ ((), 0, False), ((), -1, False), @@ -2188,49 +2188,52 @@ def test_np_argmax(): ((5, 0, 3), 1, True), ] dtypes = ['float16', 'float32', 'float64'] + ops = ['argmin', 'argmax'] - class TestArgMax(HybridBlock): - def __init__(self, axis=None): - super(TestArgMax, self).__init__() + class TestArgExtreme(HybridBlock): + def __init__(self, op_name, axis=None): + super(TestArgExtreme, self).__init__() + self._op_name = op_name self._axis = axis def hybrid_forward(self, F, x): - return F.np.argmax(x, self._axis) - - for shape, axis, throw_exception in workloads: - for dtype in dtypes: - a = np.random.uniform(size=shape, dtype=dtype) - if throw_exception: - # Cannot use assert_exception because sometimes the main thread - # proceeds to `assert False` before the exception is thrown - # in the worker thread. Have to use mx.nd.waitall() here - # to block the main thread. 
- try: - np.argmax(a, axis) - mx.nd.waitall() - assert False - except mx.MXNetError: - pass - else: - mx_ret = np.argmax(a, axis=axis) - np_ret = _np.argmax(a.asnumpy(), axis=axis) - assert same(mx_ret.asnumpy(), np_ret) + return getattr(x, self._op_name)(self._axis) - for hybridize in [False, True]: - net = TestArgMax(axis) - if hybridize: - net.hybridize() + for op_name in ops: + for shape, axis, throw_exception in workloads: + for dtype in dtypes: + a = np.random.uniform(size=shape, dtype=dtype) if throw_exception: + # Cannot use assert_exception because sometimes the main thread + # proceeds to `assert False` before the exception is thrown + # in the worker thread. Have to use mx.nd.waitall() here + # to block the main thread. try: - net(a) + getattr(np, op_name)(a, axis) mx.nd.waitall() assert False except mx.MXNetError: pass else: - mx_ret = net(a) + mx_ret = getattr(np, op_name)(a, axis=axis) + np_ret = getattr(_np, op_name)(a.asnumpy(), axis=axis) assert same(mx_ret.asnumpy(), np_ret) + for hybridize in [False, True]: + net = TestArgExtreme(op_name, axis) + if hybridize: + net.hybridize() + if throw_exception: + try: + net(a) + mx.nd.waitall() + assert False + except mx.MXNetError: + pass + else: + mx_ret = net(a) + assert same(mx_ret.asnumpy(), np_ret) + @with_seed() @use_np From ffc5392d6c1328258cbdb8ceae724f055a8cae76 Mon Sep 17 00:00:00 2001 From: Haozheng Fan Date: Mon, 28 Oct 2019 07:36:26 +0800 Subject: [PATCH 30/32] Disable float16 test (#16643) --- tests/python/unittest/test_numpy_op.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/python/unittest/test_numpy_op.py b/tests/python/unittest/test_numpy_op.py index c4f756d56a33..391a07411b15 100644 --- a/tests/python/unittest/test_numpy_op.py +++ b/tests/python/unittest/test_numpy_op.py @@ -3507,7 +3507,7 @@ def dbg(name, data): # ('abiz,abjz->abij', [(64, 8, 128, 512), (64, 8, 128, 512)], lambda *args: (_np.matmul(_np.ones((64, 8, 128, 128)), args[1]), # _np.matmul(_np.ones((64, 8, 128, 128)), args[0]))), ] - dtypes = ['float16', 'float32', 'float64', 'int32'] + dtypes = ['float32', 'float64', 'int32'] acc_type = {'float16': 'float32', 'float32': 'float64', 'float64': 'float64', 'int32': 'int64'} for hybridize in [False, True]: From 11dff51f71c4de0f564cca3f1c22ae2bbeeba43a Mon Sep 17 00:00:00 2001 From: Zhennan Qin Date: Mon, 28 Oct 2019 10:11:28 +0800 Subject: [PATCH 31/32] Fix GetMKLDNNData for delay alloc (#16618) * Fix GetMKLDNNData for delay alloc * Run CI * Run CI * Run CI * Run CI * Run CI Change-Id: I7ac2796e0ee8439c92fd2bd7a70a23a359b76b12 --- src/ndarray/ndarray.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc index aaa7aedf8bcd..78a6cfb15fd2 100644 --- a/src/ndarray/ndarray.cc +++ b/src/ndarray/ndarray.cc @@ -647,6 +647,7 @@ const mkldnn::memory *NDArray::GetMKLDNNData() const { // If this is a view, we can't create a MKLDNN memory for the chunk // because we don't have the complete data type and shape information for // the chunk. + CheckAndAlloc(); void *off_addr = static_cast<char *>(ptr_->shandle.dptr) + byte_offset_; // Create the primitive desc for the new mkldnn memory. mkldnn::memory::dims dims(shape().ndim()); @@ -665,6 +666,7 @@ } else { // If this isn't a view, we can create a MKLDNN memory and store it in the // chunk.
+ CheckAndAlloc(); ptr_->SetMKLMem(shape_, dtype_); MKLDNNStream::Get()->RegisterMem(ptr_->mkl_mem_->GetMem()); return ptr_->mkl_mem_->GetRaw(); From d7a8ccf1c8e1fab2cd6c33704bafe1ce04957e0d Mon Sep 17 00:00:00 2001 From: Zhennan Qin Date: Mon, 28 Oct 2019 10:31:42 +0800 Subject: [PATCH 32/32] Revert "[mkldnn-1.0]Rebase to master (#16648)" This reverts commit dea3dd23d1982c913b3af6cfc7f4115c2cfa7244. --- .travis.yml | 1 - SECURITY.md | 25 --- ci/docker/runtime_functions.sh | 19 +- ci/windows/test_py2_cpu.ps1 | 1 - ci/windows/test_py2_gpu.ps1 | 1 - ci/windows/test_py3_cpu.ps1 | 1 - ci/windows/test_py3_gpu.ps1 | 1 - cpp-package/README.md | 2 +- .../python/api/gluon/data/index.rst | 63 ------- .../api/gluon/data/vision/datasets/index.rst | 26 --- .../python/api/gluon/data/vision/index.rst | 53 ------ .../gluon/data/vision/transforms/index.rst | 48 ----- .../python/api/mxnet/log/index.rst | 23 --- .../python/api/mxnet/model/index.rst | 23 --- .../getting-started/crash-course/5-predict.md | 2 +- .../crash-course/6-use_gpus.md | 2 +- .../gluon_from_experiment_to_deployment.md | 10 +- .../getting-started/to-mxnet/pytorch.md | 12 +- .../tutorials/packages/gluon/image/mnist.md | 4 +- .../tutorials/packages/ndarray/sparse/csr.md | 2 +- .../packages/ndarray/sparse/row_sparse.md | 2 +- .../packages/ndarray/sparse/train.md | 10 +- .../get_started/devices/nvidia-jetson.md | 2 +- .../_includes/get_started/get_started.html | 8 +- .../_includes/get_started/linux/java/cpu.md | 2 +- .../_includes/get_started/linux/java/gpu.md | 2 +- .../linux/julia/build-from-source.md | 2 +- .../src/_includes/get_started/linux/r/cpu.md | 2 +- .../src/_includes/get_started/linux/r/gpu.md | 2 +- .../_includes/get_started/macos/java/cpu.md | 2 +- .../src/_includes/get_started/pip_snippet.md | 2 +- .../windows/julia/build-from-source.md | 2 +- .../get_started/windows/perl/perl.md | 2 +- .../windows/python/cpu/build-from-source.md | 2 +- .../windows/python/gpu/build-from-source.md | 2 +- .../_includes/get_started/windows/r/cpu.md | 2 +- .../_includes/get_started/windows/r/gpu.md | 2 +- .../api/architecture/note_data_loading.md | 2 +- .../tutorials/mxnet_cpp_inference_tutorial.md | 10 +- .../src/pages/api/faq/distributed_training.md | 6 +- docs/static_site/src/pages/api/faq/float16.md | 8 +- .../src/pages/api/faq/gradient_compression.md | 2 +- .../src/pages/api/faq/model_parallel_lstm.md | 2 +- .../static_site/src/pages/api/faq/recordio.md | 1 + .../api/r/docs/tutorials/callback_function.md | 8 +- .../r/docs/tutorials/custom_loss_function.md | 8 +- .../api/r/docs/tutorials/multi_dim_lstm.md | 4 +- .../src/pages/api/r/docs/tutorials/ndarray.md | 12 +- .../src/pages/api/r/docs/tutorials/symbol.md | 12 +- .../pages/get_started/build_from_source.md | 2 +- .../src/pages/get_started/index.html | 2 +- include/mxnet/c_api.h | 2 +- include/mxnet/imperative.h | 10 +- julia/docs/Project.toml | 2 +- julia/docs/make.jl | 33 ---- julia/docs/mkdocs.yml | 1 - julia/docs/src/api.md | 15 +- julia/docs/src/api/ndarray.md | 20 +- julia/docs/src/api/symbolic-node.md | 11 +- julia/docs/src/index.md | 16 +- julia/docs/src/tutorial/char-lstm.md | 6 +- julia/docs/src/tutorial/mnist.md | 2 +- julia/docs/src/user-guide/overview.md | 2 + julia/examples/char-lstm/README.md | 2 +- julia/src/executor.jl | 2 +- julia/src/symbolic-node/show.jl | 2 +- perl-package/AI-MXNet/t/test_autograd.t | 1 - perl-package/AI-MXNet/t/test_gluon_trainer.t | 2 +- perl-package/AI-MXNet/t/test_module.t | 1 - perl-package/AI-MXNet/t/test_sparse_ndarray.t | 1 - 
python/mxnet/kvstore.py | 7 +- python/mxnet/metric.py | 20 +- python/mxnet/profiler.py | 3 +- python/mxnet/rtc.py | 3 +- python/mxnet/runtime.py | 22 ++- python/mxnet/test_utils.py | 12 +- python/mxnet/util.py | 6 +- .../native/org_apache_mxnet_native_c_api.cc | 4 +- src/c_api/c_api_ndarray.cc | 2 +- src/executor/graph_executor.cc | 22 +-- src/imperative/cached_op.cc | 76 +++----- src/imperative/cached_op.h | 7 - src/imperative/imperative_utils.h | 2 - src/ndarray/ndarray.cc | 30 ++- src/operator/quantization/dequantize.cc | 2 + src/operator/subgraph/build_subgraph.cc | 8 +- src/operator/tensor/dot-inl.h | 177 ++++++++++++------ src/operator/tensor/dot.cc | 84 ++------- src/operator/tensor/dot.cu | 3 + tests/nightly/test_large_array.py | 4 +- tests/nightly/test_large_vector.py | 85 +-------- tests/python/unittest/test_numpy_op.py | 119 ------------ tests/python/unittest/test_operator.py | 5 +- 93 files changed, 405 insertions(+), 875 deletions(-) delete mode 100644 SECURITY.md delete mode 100644 docs/python_docs/python/api/gluon/data/index.rst delete mode 100644 docs/python_docs/python/api/gluon/data/vision/datasets/index.rst delete mode 100644 docs/python_docs/python/api/gluon/data/vision/index.rst delete mode 100644 docs/python_docs/python/api/gluon/data/vision/transforms/index.rst delete mode 100644 docs/python_docs/python/api/mxnet/log/index.rst delete mode 100644 docs/python_docs/python/api/mxnet/model/index.rst diff --git a/.travis.yml b/.travis.yml index 485faadee277..b0aa26c1a3a1 100644 --- a/.travis.yml +++ b/.travis.yml @@ -34,7 +34,6 @@ script: # Temporarily disable travis build due to travis constantly time out, tracked in # https://github.com/apache/incubator-mxnet/issues/16535: - export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0 - - export MXNET_SUBGRAPH_VERBOSE=0 - mv make/osx.mk config.mk # - make -j 2 diff --git a/SECURITY.md b/SECURITY.md deleted file mode 100644 index bbb4505499c1..000000000000 --- a/SECURITY.md +++ /dev/null @@ -1,25 +0,0 @@ - - - - - - - - - - - - - - - - - -# Security Policy - -## Reporting a Vulnerability -The Apache Software Foundation takes a very active stance in eliminating security problems and denial of service attacks against its products. - -We strongly encourage folks to report such problems to our private security mailing list first, before disclosing them in a public forum. - -For instructions how to report a security vulnerability, please consult our [security guide](https://mxnet.apache.org/api/faq/security).
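The runtime_functions.sh hunks that follow make up the bulk of this revert's CI changes: each one drops the export MXNET_SUBGRAPH_VERBOSE=0 line that the rebase had added beside the existing storage-fallback switch in every test runner. As a rough sketch of how these switches reach the library from a test script (illustrative only; the two variable names are the real ones used in these scripts, the rest is a minimal example):

```python
import os

# MXNet reads its logging switches from the process environment at runtime
# (dmlc::GetEnv on the C++ side), so CI scripts export them before any test runs.
os.environ["MXNET_STORAGE_FALLBACK_LOG_VERBOSE"] = "0"  # kept by this revert
# The rebase also exported MXNET_SUBGRAPH_VERBOSE=0 at this point; the revert drops it.

import mxnet as mx
print((mx.nd.ones((2, 2)) + 1).asnumpy())  # any op now runs under these settings
```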
diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh index c2acc0f40d7d..0112d6bb1704 100755 --- a/ci/docker/runtime_functions.sh +++ b/ci/docker/runtime_functions.sh @@ -1007,7 +1007,6 @@ cd_unittest_ubuntu() { export PYTHONPATH=./python/ export MXNET_MKLDNN_DEBUG=1 # Ignored if not present export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0 - export MXNET_SUBGRAPH_VERBOSE=0 export MXNET_ENABLE_CYTHON=0 export CD_JOB=1 # signal this is a CD run so any unecessary tests can be skipped @@ -1050,7 +1049,6 @@ unittest_ubuntu_python2_cpu_cython() { export PYTHONPATH=./python/ export MXNET_MKLDNN_DEBUG=1 export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0 - export MXNET_SUBGRAPH_VERBOSE=0 export MXNET_ENABLE_CYTHON=1 export MXNET_ENFORCE_CYTHON=1 check_cython 2 @@ -1064,7 +1062,6 @@ unittest_ubuntu_python2_cpu() { export PYTHONPATH=./python/ export MXNET_MKLDNN_DEBUG=0 export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0 - export MXNET_SUBGRAPH_VERBOSE=0 export MXNET_ENABLE_CYTHON=0 nosetests-2.7 $NOSE_COVERAGE_ARGUMENTS $NOSE_TIMER_ARGUMENTS --with-xunit --xunit-file nosetests_unittest.xml --verbose tests/python/unittest nosetests-2.7 $NOSE_COVERAGE_ARGUMENTS $NOSE_TIMER_ARGUMENTS --with-xunit --xunit-file nosetests_train.xml --verbose tests/python/train @@ -1076,7 +1073,6 @@ unittest_ubuntu_python3_cpu() { export PYTHONPATH=./python/ export MXNET_MKLDNN_DEBUG=0 # Ignored if not present export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0 - export MXNET_SUBGRAPH_VERBOSE=0 export MXNET_ENABLE_CYTHON=0 nosetests-3.4 $NOSE_COVERAGE_ARGUMENTS $NOSE_TIMER_ARGUMENTS --with-xunit --xunit-file nosetests_unittest.xml --verbose tests/python/unittest nosetests-3.4 $NOSE_COVERAGE_ARGUMENTS $NOSE_TIMER_ARGUMENTS --with-xunit --xunit-file nosetests_quantization.xml --verbose tests/python/quantization @@ -1087,7 +1083,6 @@ unittest_ubuntu_python3_cpu_mkldnn() { export PYTHONPATH=./python/ export MXNET_MKLDNN_DEBUG=0 # Ignored if not present export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0 - export MXNET_SUBGRAPH_VERBOSE=0 export MXNET_ENABLE_CYTHON=0 nosetests-3.4 $NOSE_COVERAGE_ARGUMENTS $NOSE_TIMER_ARGUMENTS --with-xunit --xunit-file nosetests_unittest.xml --verbose tests/python/unittest nosetests-3.4 $NOSE_COVERAGE_ARGUMENTS $NOSE_TIMER_ARGUMENTS --with-xunit --xunit-file nosetests_mkl.xml --verbose tests/python/mkl @@ -1098,7 +1093,6 @@ unittest_ubuntu_python2_gpu() { export PYTHONPATH=./python/ export MXNET_MKLDNN_DEBUG=0 # Ignored if not present export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0 - export MXNET_SUBGRAPH_VERBOSE=0 export CUDNN_VERSION=${CUDNN_VERSION:-7.0.3} nosetests-2.7 $NOSE_COVERAGE_ARGUMENTS $NOSE_TIMER_ARGUMENTS --with-xunit --xunit-file nosetests_gpu.xml --verbose tests/python/gpu } @@ -1108,7 +1102,6 @@ unittest_ubuntu_python3_gpu() { export PYTHONPATH=./python/ export MXNET_MKLDNN_DEBUG=0 # Ignored if not present export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0 - export MXNET_SUBGRAPH_VERBOSE=0 export CUDNN_VERSION=${CUDNN_VERSION:-7.0.3} export MXNET_ENABLE_CYTHON=0 nosetests-3.4 $NOSE_COVERAGE_ARGUMENTS $NOSE_TIMER_ARGUMENTS --with-xunit --xunit-file nosetests_gpu.xml --verbose tests/python/gpu @@ -1119,7 +1112,6 @@ unittest_ubuntu_python3_gpu_cython() { export PYTHONPATH=./python/ export MXNET_MKLDNN_DEBUG=1 # Ignored if not present export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0 - export MXNET_SUBGRAPH_VERBOSE=0 export CUDNN_VERSION=${CUDNN_VERSION:-7.0.3} export MXNET_ENABLE_CYTHON=1 export MXNET_ENFORCE_CYTHON=1 @@ -1131,7 +1123,6 @@ unittest_ubuntu_python3_gpu_nocudnn() { set -ex export 
PYTHONPATH=./python/ export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0 - export MXNET_SUBGRAPH_VERBOSE=0 export CUDNN_OFF_TEST_ONLY=true export MXNET_ENABLE_CYTHON=0 nosetests-3.4 $NOSE_COVERAGE_ARGUMENTS $NOSE_TIMER_ARGUMENTS --with-xunit --xunit-file nosetests_gpu.xml --verbose tests/python/gpu @@ -1141,7 +1132,6 @@ unittest_ubuntu_tensorrt_gpu() { set -ex export PYTHONPATH=./python/ export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0 - export MXNET_SUBGRAPH_VERBOSE=0 export LD_LIBRARY_PATH=/work/mxnet/lib:$LD_LIBRARY_PATH export CUDNN_VERSION=${CUDNN_VERSION:-7.0.3} export MXNET_ENABLE_CYTHON=0 @@ -1156,7 +1146,6 @@ unittest_ubuntu_python2_quantization_gpu() { export PYTHONPATH=./python/ export MXNET_MKLDNN_DEBUG=0 # Ignored if not present export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0 - export MXNET_SUBGRAPH_VERBOSE=0 export CUDNN_VERSION=${CUDNN_VERSION:-7.0.3} export MXNET_ENABLE_CYTHON=0 nosetests-2.7 $NOSE_COVERAGE_ARGUMENTS $NOSE_TIMER_ARGUMENTS --with-xunit --xunit-file nosetests_quantization_gpu.xml --verbose tests/python/quantization_gpu @@ -1169,7 +1158,6 @@ unittest_ubuntu_python3_quantization_gpu() { export PYTHONPATH=./python/ export MXNET_MKLDNN_DEBUG=0 # Ignored if not present export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0 - export MXNET_SUBGRAPH_VERBOSE=0 export CUDNN_VERSION=${CUDNN_VERSION:-7.0.3} export MXNET_ENABLE_CYTHON=0 nosetests-3.4 $NOSE_COVERAGE_ARGUMENTS $NOSE_TIMER_ARGUMENTS --with-xunit --xunit-file nosetests_quantization_gpu.xml --verbose tests/python/quantization_gpu @@ -1332,7 +1320,6 @@ integrationtest_ubuntu_gpu_python() { set -ex export PYTHONPATH=./python/ export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0 - export MXNET_SUBGRAPH_VERBOSE=0 python example/image-classification/test_score.py } @@ -1361,7 +1348,6 @@ integrationtest_ubuntu_cpu_dist_kvstore() { pushd . export PYTHONPATH=./python/ export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0 - export MXNET_SUBGRAPH_VERBOSE=0 export MXNET_USE_OPERATOR_TUNING=0 cd tests/nightly/ ../../tools/launch.py -n 7 --launcher local python dist_sync_kvstore.py --type=gluon_step_cpu @@ -1396,7 +1382,6 @@ integrationtest_ubuntu_gpu_dist_kvstore() { pushd . 
export PYTHONPATH=./python/ export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0 - export MXNET_SUBGRAPH_VERBOSE=0 cd tests/nightly/ ../../tools/launch.py -n 4 --launcher local python dist_device_sync_kvstore.py ../../tools/launch.py -n 4 --launcher local python dist_sync_kvstore.py --type=init_gpu @@ -1584,7 +1569,6 @@ nightly_tutorial_test_ubuntu_python3_gpu() { export MXNET_DOCS_BUILD_MXNET=0 make html export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0 - export MXNET_SUBGRAPH_VERBOSE=0 export PYTHONPATH=/work/mxnet/python/ export MXNET_TUTORIAL_TEST_KERNEL=python3 cd /work/mxnet/tests/tutorials @@ -1598,7 +1582,6 @@ nightly_tutorial_test_ubuntu_python2_gpu() { export MXNET_DOCS_BUILD_MXNET=0 make html export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0 - export MXNET_SUBGRAPH_VERBOSE=0 export PYTHONPATH=/work/mxnet/python/ export MXNET_TUTORIAL_TEST_KERNEL=python2 cd /work/mxnet/tests/tutorials @@ -1992,7 +1975,7 @@ cd_package_pypi() { popd } -# Sanity checks wheel file +# Sanity checks wheel file cd_integration_test_pypi() { set -ex local python_cmd=${1:?"This function requires a python command as the first argument"} diff --git a/ci/windows/test_py2_cpu.ps1 b/ci/windows/test_py2_cpu.ps1 index c39d1fa45328..df9b15ba1ec3 100644 --- a/ci/windows/test_py2_cpu.ps1 +++ b/ci/windows/test_py2_cpu.ps1 @@ -20,7 +20,6 @@ $env:MXNET_LIBRARY_PATH=join-path $pwd.Path windows_package\lib\libmxnet.dll $env:PYTHONPATH=join-path $pwd.Path windows_package\python $env:MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0 -$env:MXNET_SUBGRAPH_VERBOSE=0 $env:MXNET_HOME=[io.path]::combine($PSScriptRoot, 'mxnet_home') C:\Python27\Scripts\pip install -r tests\requirements.txt diff --git a/ci/windows/test_py2_gpu.ps1 b/ci/windows/test_py2_gpu.ps1 index b2ea62fc7cd4..f2974ff6f7b6 100644 --- a/ci/windows/test_py2_gpu.ps1 +++ b/ci/windows/test_py2_gpu.ps1 @@ -20,7 +20,6 @@ $env:MXNET_LIBRARY_PATH=join-path $pwd.Path windows_package\lib\libmxnet.dll $env:PYTHONPATH=join-path $pwd.Path windows_package\python $env:MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0 -$env:MXNET_SUBGRAPH_VERBOSE=0 $env:MXNET_HOME=[io.path]::combine($PSScriptRoot, 'mxnet_home') C:\Python27\Scripts\pip install -r tests\requirements.txt diff --git a/ci/windows/test_py3_cpu.ps1 b/ci/windows/test_py3_cpu.ps1 index 1e09b5c98ce1..900bfd161cd0 100644 --- a/ci/windows/test_py3_cpu.ps1 +++ b/ci/windows/test_py3_cpu.ps1 @@ -20,7 +20,6 @@ $env:MXNET_LIBRARY_PATH=join-path $pwd.Path windows_package\lib\libmxnet.dll $env:PYTHONPATH=join-path $pwd.Path windows_package\python $env:MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0 -$env:MXNET_SUBGRAPH_VERBOSE=0 $env:MXNET_HOME=[io.path]::combine($PSScriptRoot, 'mxnet_home') C:\Python37\Scripts\pip install -r tests\requirements.txt diff --git a/ci/windows/test_py3_gpu.ps1 b/ci/windows/test_py3_gpu.ps1 index 9bf7d04d8a88..b6e951b291fb 100644 --- a/ci/windows/test_py3_gpu.ps1 +++ b/ci/windows/test_py3_gpu.ps1 @@ -20,7 +20,6 @@ $env:MXNET_LIBRARY_PATH=join-path $pwd.Path windows_package\lib\libmxnet.dll $env:PYTHONPATH=join-path $pwd.Path windows_package\python $env:MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0 -$env:MXNET_SUBGRAPH_VERBOSE=0 $env:MXNET_HOME=[io.path]::combine($PSScriptRoot, 'mxnet_home') C:\Python37\Scripts\pip install -r tests\requirements.txt diff --git a/cpp-package/README.md b/cpp-package/README.md index 77ff0ee36e80..05fb506db42b 100644 --- a/cpp-package/README.md +++ b/cpp-package/README.md @@ -55,7 +55,7 @@ In order to consume the C++ API please follow the steps below. ## Tutorial -A basic tutorial can be found at . +A basic tutorial can be found at . 
## Examples diff --git a/docs/python_docs/python/api/gluon/data/index.rst b/docs/python_docs/python/api/gluon/data/index.rst deleted file mode 100644 index f9e8a21e69d2..000000000000 --- a/docs/python_docs/python/api/gluon/data/index.rst +++ /dev/null @@ -1,63 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. - -gluon.data -========== - -.. automodule:: mxnet.gluon.data - -Datasets --------- - -.. autosummary:: - - Dataset - ArrayDataset - RecordFileDataset - SimpleDataset - -Sampling --------- - -.. autosummary:: - - Sampler - SequentialSampler - RandomSampler - BatchSampler - -DataLoader ----------- - -.. autosummary:: - - DataLoader - - -API Reference -------------- -.. automodule:: mxnet.gluon.data - :members: - :imported-members: - :autosummary: - -.. toctree:: - :hidden: - :maxdepth: 2 - :glob: - - */index \ No newline at end of file diff --git a/docs/python_docs/python/api/gluon/data/vision/datasets/index.rst b/docs/python_docs/python/api/gluon/data/vision/datasets/index.rst deleted file mode 100644 index 6b007526607a..000000000000 --- a/docs/python_docs/python/api/gluon/data/vision/datasets/index.rst +++ /dev/null @@ -1,26 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. - -vision.datasets -=============== - -Gluon provides pre-defined vision datasets functions in the :py:mod:`mxnet.gluon.data.vision.datasets` -module. - -.. automodule:: mxnet.gluon.data.vision.datasets - :members: - :autosummary: diff --git a/docs/python_docs/python/api/gluon/data/vision/index.rst b/docs/python_docs/python/api/gluon/data/vision/index.rst deleted file mode 100644 index 2731b5f4245a..000000000000 --- a/docs/python_docs/python/api/gluon/data/vision/index.rst +++ /dev/null @@ -1,53 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. 
The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. - -data.vision -============ - -.. automodule:: mxnet.gluon.data.vision - -Datasets -^^^^^^^^ - -.. autosummary:: - :nosignatures: - - mxnet.gluon.data.vision.datasets - - -Data transformations -^^^^^^^^^^^^^^^^^^^^ - - -.. autosummary:: - :nosignatures: - - mxnet.gluon.data.vision.transforms - - -API Reference -------------- -.. automodule:: mxnet.gluon.data.vision - :members: - :autosummary: - -.. toctree:: - :hidden: - :maxdepth: 2 - :glob: - - */index \ No newline at end of file diff --git a/docs/python_docs/python/api/gluon/data/vision/transforms/index.rst b/docs/python_docs/python/api/gluon/data/vision/transforms/index.rst deleted file mode 100644 index 60d975d87aff..000000000000 --- a/docs/python_docs/python/api/gluon/data/vision/transforms/index.rst +++ /dev/null @@ -1,48 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. - -vision.transforms -================= - -Gluon provides pre-defined vision transformation and data augmentation functions in the :py:mod:`mxnet.gluon.data.vision.transforms` -module. - -.. currentmodule:: mxnet.gluon.data.vision - -.. autosummary:: - :nosignatures: - - transforms.Compose - transforms.Cast - transforms.ToTensor - transforms.Normalize - transforms.RandomResizedCrop - transforms.CenterCrop - transforms.Resize - transforms.RandomFlipLeftRight - transforms.RandomFlipTopBottom - transforms.RandomBrightness - transforms.RandomContrast - transforms.RandomSaturation - transforms.RandomHue - transforms.RandomColorJitter - transforms.RandomLighting - -API Reference -------------- -.. automodule:: mxnet.gluon.data.vision.transforms - :members: diff --git a/docs/python_docs/python/api/mxnet/log/index.rst b/docs/python_docs/python/api/mxnet/log/index.rst deleted file mode 100644 index fd4d8788c28a..000000000000 --- a/docs/python_docs/python/api/mxnet/log/index.rst +++ /dev/null @@ -1,23 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. 
You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. - -mxnet.log -========= - -.. automodule:: mxnet.log - :members: - :autosummary: \ No newline at end of file diff --git a/docs/python_docs/python/api/mxnet/model/index.rst b/docs/python_docs/python/api/mxnet/model/index.rst deleted file mode 100644 index 69bcddce6bc1..000000000000 --- a/docs/python_docs/python/api/mxnet/model/index.rst +++ /dev/null @@ -1,23 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. - -mxnet.model -=========== - -.. automodule:: mxnet.model - :members: - :autosummary: diff --git a/docs/python_docs/python/tutorials/getting-started/crash-course/5-predict.md b/docs/python_docs/python/tutorials/getting-started/crash-course/5-predict.md index 9afe95b58403..7a7738d8df1b 100644 --- a/docs/python_docs/python/tutorials/getting-started/crash-course/5-predict.md +++ b/docs/python_docs/python/tutorials/getting-started/crash-course/5-predict.md @@ -21,7 +21,7 @@ A saved model can be used in multiple places, such as to continue training, to f ## Prerequisites -Please run the [previous tutorial](4-train.html) to train the network and save its parameters to file. You will need this file to run the following steps. +Please run the [previous tutorial](train.md) to train the network and save its parameters to file. You will need this file to run the following steps. ```{.python .input n=1} from mxnet import nd diff --git a/docs/python_docs/python/tutorials/getting-started/crash-course/6-use_gpus.md b/docs/python_docs/python/tutorials/getting-started/crash-course/6-use_gpus.md index a0788ba7df2d..b78c38ab7077 100644 --- a/docs/python_docs/python/tutorials/getting-started/crash-course/6-use_gpus.md +++ b/docs/python_docs/python/tutorials/getting-started/crash-course/6-use_gpus.md @@ -99,7 +99,7 @@ net(x) Finally, we show how to use multiple GPUs to jointly train a neural network through data parallelism. Let's assume there are *n* GPUs. We split each data batch into *n* parts, and then each GPU will run the forward and backward passes using one part of the data. -Let's first copy the data definitions and the transform function from the [previous tutorial](5-predict.html). +Let's first copy the data definitions and the transform function from the [previous tutorial](predict.md). 
```{.python .input} batch_size = 256 diff --git a/docs/python_docs/python/tutorials/getting-started/gluon_from_experiment_to_deployment.md b/docs/python_docs/python/tutorials/getting-started/gluon_from_experiment_to_deployment.md index b1f65e682263..8d2c4e100c76 100644 --- a/docs/python_docs/python/tutorials/getting-started/gluon_from_experiment_to_deployment.md +++ b/docs/python_docs/python/tutorials/getting-started/gluon_from_experiment_to_deployment.md @@ -20,7 +20,7 @@ ## Overview MXNet Gluon API comes with a lot of great features, and it can provide you everything you need: from experimentation to deploying the model. In this tutorial, we will walk you through a common use case on how to build a model using gluon, train it on your data, and deploy it for inference. -This tutorial covers training and inference in Python, please continue to [C++ inference part](/api/cpp/docs/tutorials/cpp_inference) after you finish. +This tutorial covers training and inference in Python, please continue to [C++ inference part](https://mxnet.apache.org/versions/master/tutorials/c++/mxnet_cpp_inference_tutorial.html) after you finish. Let's say you need to build a service that provides flower species recognition. A common problem is that you don't have enough data to train a good model. In such cases, a technique called Transfer Learning can be used to make a more robust model. In Transfer Learning we make use of a pre-trained model that solves a related task, and was trained on a very large standard dataset, such as ImageNet. ImageNet is from a different domain, but we can utilize the knowledge in this pre-trained model to perform the new task at hand. @@ -77,7 +77,7 @@ from mxnet.gluon.data.vision import transforms from mxnet.gluon.model_zoo.vision import resnet50_v2 ``` -Next, we define the hyper-parameters that we will use for fine-tuning. We will use the [MXNet learning rate scheduler](/api/python/docs/tutorials/packages/gluon/training/learning_rates/learning_rate_schedules.html) to adjust learning rates during training. +Next, we define the hyper-parameters that we will use for fine-tuning. We will use the [MXNet learning rate scheduler](../packages/gluon/training/learning_rates/learning_rate_schedules.html) to adjust learning rates during training. Here we set the `epochs` to 1 for quick demonstration, please change to 40 for actual training. ```python @@ -161,7 +161,7 @@ test_data = gluon.data.DataLoader( We will use pre-trained ResNet50_v2 model which was pre-trained on the [ImageNet Dataset](http://www.image-net.org/) with 1000 classes. To match the classes in the Flower dataset, we must redefine the last softmax (output) layer to be 102, then initialize the parameters. -Before we go to training, one unique Gluon feature you should be aware of is hybridization. It allows you to convert your imperative code to a static symbolic graph, which is much more efficient to execute. There are two main benefits of hybridizing your model: better performance and easier serialization for deployment. The best part is that it's as simple as just calling `net.hybridize()`. To know more about Gluon hybridization, please follow the [hybridization tutorial](/api/python/docs/tutorials/packages/gluon/blocks/hybridize.html). +Before we go to training, one unique Gluon feature you should be aware of is hybridization. It allows you to convert your imperative code to a static symbolic graph, which is much more efficient to execute. 
There are two main benefits of hybridizing your model: better performance and easier serialization for deployment. The best part is that it's as simple as just calling `net.hybridize()`. To know more about Gluon hybridization, please follow the [hybridization tutorial](https://mxnet.apache.org/tutorials/gluon/hybrid.html). @@ -265,7 +265,7 @@ finetune_net.export("flower-recognition", epoch=epochs) ## Load the model and run inference using the MXNet Module API MXNet provides various useful tools and interfaces for deploying your model for inference. For example, you can use [MXNet Model Server](https://github.com/awslabs/mxnet-model-server) to start a service and host your trained model easily. -Besides that, you can also use MXNet's different language APIs to integrate your model with your existing service. We provide [Python](/api/python.html), [Java](/api/java.html), [Scala](/api/scala.html), and [C++](/api/cpp) APIs. +Besides that, you can also use MXNet's different language APIs to integrate your model with your existing service. We provide [Python](https://mxnet.apache.org/api/python/module/module.html), [Java](https://mxnet.apache.org/api/java/index.html), [Scala](https://mxnet.apache.org/api/scala/index.html), and [C++](https://mxnet.apache.org/api/c++/index.html) APIs. Here we will briefly introduce how to run inference using Module API in Python. There is more detailed explanation available in the [Predict Image Tutorial](https://mxnet.apache.org/tutorials/python/predict_image.html). In general, prediction consists of the following steps: @@ -315,7 +315,7 @@ You can continue to the [next tutorial](https://mxnet.apache.org/versions/master You can also find more ways to run inference and deploy your models here: 1. [Java Inference examples](https://github.com/apache/incubator-mxnet/tree/master/scala-package/examples/src/main/java/org/apache/mxnetexamples/javaapi/infer) -2. [Scala Inference examples](/api/scala/docs/tutorials/infer) +2. [Scala Inference examples](https://mxnet.apache.org/tutorials/scala/) 4. [MXNet Model Server Examples](https://github.com/awslabs/mxnet-model-server/tree/master/examples) ## References diff --git a/docs/python_docs/python/tutorials/getting-started/to-mxnet/pytorch.md b/docs/python_docs/python/tutorials/getting-started/to-mxnet/pytorch.md index 1ab490fbaa42..d7720bac4348 100644 --- a/docs/python_docs/python/tutorials/getting-started/to-mxnet/pytorch.md +++ b/docs/python_docs/python/tutorials/getting-started/to-mxnet/pytorch.md @@ -164,7 +164,7 @@ mx_trainer = gluon.Trainer(mx_net.collect_params(), 'sgd', {'learning_rate': 0.1}) ``` -The code difference between frameworks is small. The main difference is that in Apache MXNet we use [Trainer](/api/python/docs/api/gluon/trainer.html) class, which accepts optimization algorithm as an argument. We also use [.collect_params()](/api/python/docs/api/gluon/block.html#mxnet.gluon.Block.collect_params) method to get parameters of the network. +The code difference between frameworks is small. The main difference is that in Apache MXNet we use [Trainer](https://mxnet.apache.org/api/python/docs/api/gluon/mxnet.gluon.Trainer.html) class, which accepts optimization algorithm as an argument. We also use [.collect_params()](/api/python/docs/api/gluon/_autogen/mxnet.gluon.nn.Block.collect_params.html) method to get parameters of the network. ### 4. 
Training @@ -212,13 +212,13 @@ Some of the differences in Apache MXNet when compared to PyTorch are as follows: * In Apache MXNet, you don't need to flatten the 4-D input into 2-D when feeding the data into forward pass. -* In Apache MXNet, you need to perform the calculation within the [autograd.record()](/api/python/docs/api/autograd/index.html?autograd%20record#mxnet.autograd.record) scope so that it can be automatically differentiated in the backward pass. +* In Apache MXNet, you need to perform the calculation within the [autograd.record()](/api/python/docs/api/gluon-related/_autogen/mxnet.autograd.record.html) scope so that it can be automatically differentiated in the backward pass. * It is not necessary to clear the gradient every time as with PyTorch's `trainer.zero_grad()` because by default the new gradient is written in, not accumulated. -* You need to specify the update step size (usually batch size) when performing [step()](/api/python/docs/api/gluon/trainer.html?#mxnet.gluon.Trainer.step) on the trainer. +* You need to specify the update step size (usually batch size) when performing [step()](/api/python/docs/api/gluon/_autogen/mxnet.gluon.Trainer.step.html) on the trainer. -* You need to call [.asscalar()](/api/python/docs/api/ndarray/ndarray.html?#mxnet.ndarray.NDArray.asscalar) to turn a multidimensional array into a scalar. +* You need to call [.asscalar()](/api/python/docs/api/ndarray/_autogen/mxnet.ndarray.NDArray.asscalar.html) to turn a multidimensional array into a scalar. * In this sample, Apache MXNet is twice as fast as PyTorch. Though you need to be cautious with such toy comparisons. @@ -230,9 +230,9 @@ As we saw above, Apache MXNet Gluon API and PyTorch have many similarities. The While Apache MXNet Gluon API is very similar to PyTorch, there are some extra functionality that can make your code even faster. -* Check out [Hybridize tutorial](/api/python/docs/tutorials/packages/gluon/blocks/hybridize.html) to learn how to write imperative code which can be converted to symbolic one. +* Check out [Hybridize tutorial](/api/python/docs/guide/packages/gluon/hybridize.html) to learn how to write imperative code which can be converted to symbolic one. -* Also, check out how to extend Apache MXNet with your own [custom layers](/api/python/docs/tutorials/packages/gluon/blocks/custom-layer.html?custom_layers). +* Also, check out how to extend Apache MXNet with your own [custom layers](/api/python/docs/guide/extend/custom_layer.html). ## Appendix diff --git a/docs/python_docs/python/tutorials/packages/gluon/image/mnist.md b/docs/python_docs/python/tutorials/packages/gluon/image/mnist.md index 39726a3a511c..8a3d8229413b 100644 --- a/docs/python_docs/python/tutorials/packages/gluon/image/mnist.md +++ b/docs/python_docs/python/tutorials/packages/gluon/image/mnist.md @@ -112,8 +112,8 @@ to train the MLP network we defined above. For our training, we will make use of the stochastic gradient descent (SGD) optimizer. In particular, we'll be using mini-batch SGD. Standard SGD processes train data one example at a time. In practice, this is very slow and one can speed up the process by processing examples in small batches. In this case, our batch size will be 100, which is a reasonable choice. Another parameter we select here is the learning rate, which controls the step size the optimizer takes in search of a solution. We'll pick a learning rate of 0.02, again a reasonable choice. Settings such as batch size and learning rate are what are usually referred to as hyper-parameters. 
What values we give them can have a great impact on training performance. -We will use [Trainer](/api/python/docs/api/gluon/trainer.html) class to apply the -[SGD optimizer](/api/python/docs/api/optimizer/index.html#mxnet.optimizer.SGD) on the +We will use [Trainer](https://mxnet.io/api/python/docs/api/gluon/mxnet.gluon.Trainer.html) class to apply the +[SGD optimizer](https://mxnet.io/api/python/docs/api/gluon-related/_autogen/mxnet.optimizer.SGD.html) on the initialized parameters. ```python diff --git a/docs/python_docs/python/tutorials/packages/ndarray/sparse/csr.md b/docs/python_docs/python/tutorials/packages/ndarray/sparse/csr.md index b91279cff4d4..0b362513c0ae 100644 --- a/docs/python_docs/python/tutorials/packages/ndarray/sparse/csr.md +++ b/docs/python_docs/python/tutorials/packages/ndarray/sparse/csr.md @@ -556,7 +556,7 @@ except mx.MXNetError as err: ## Next -[Train a Linear Regression Model with Sparse Symbols](/api/python/docs/tutorials/packages/ndarray/sparse/train.html) +[Train a Linear Regression Model with Sparse Symbols](http://mxnet.apache.org/tutorials/sparse/train.html) diff --git a/docs/python_docs/python/tutorials/packages/ndarray/sparse/row_sparse.md b/docs/python_docs/python/tutorials/packages/ndarray/sparse/row_sparse.md index 7500e82cf9e6..1241182af85b 100644 --- a/docs/python_docs/python/tutorials/packages/ndarray/sparse/row_sparse.md +++ b/docs/python_docs/python/tutorials/packages/ndarray/sparse/row_sparse.md @@ -578,7 +578,7 @@ except mx.MXNetError as err: ## Next -[Train a Linear Regression Model with Sparse Symbols](/api/python/docs/tutorials/packages/ndarray/sparse/train.html) +[Train a Linear Regression Model with Sparse Symbols](http://mxnet.apache.org/tutorials/sparse/train.html) diff --git a/docs/python_docs/python/tutorials/packages/ndarray/sparse/train.md b/docs/python_docs/python/tutorials/packages/ndarray/sparse/train.md index 336185cf7583..71669e142a4b 100644 --- a/docs/python_docs/python/tutorials/packages/ndarray/sparse/train.md +++ b/docs/python_docs/python/tutorials/packages/ndarray/sparse/train.md @@ -27,18 +27,18 @@ then train a linear regression model using sparse symbols with the Module API. To complete this tutorial, we need: -- MXNet. See the instructions for your operating system in [Setup and Installation](/get_started). +- MXNet. See the instructions for your operating system in [Setup and Installation](https://mxnet.io/get_started). -- [Jupyter Notebook](https://jupyter.org/index.html) and [Python Requests](https://3.python-requests.org/) packages. +- [Jupyter Notebook](https://jupyter.org/index.html) and [Python Requests](http://docs.python-requests.org/en/master/) packages. ``` pip install jupyter requests ``` - Basic knowledge of Symbol in MXNet. See the detailed tutorial for Symbol in [Symbol - Neural Network Graphs and Auto-differentiation](https://mxnet.apache.org/tutorials/basic/symbol.html). -- Basic knowledge of CSRNDArray in MXNet. See the detailed tutorial for CSRNDArray in [CSRNDArray - NDArray in Compressed Sparse Row Storage Format](/api/python/docs/tutorials/packages/ndarray/sparse/csr.html). +- Basic knowledge of CSRNDArray in MXNet. See the detailed tutorial for CSRNDArray in [CSRNDArray - NDArray in Compressed Sparse Row Storage Format](https://mxnet.apache.org/versions/master/tutorials/sparse/csr.html). -- Basic knowledge of RowSparseNDArray in MXNet. 
See the detailed tutorial for RowSparseNDArray in [RowSparseNDArray - NDArray for Sparse Gradient Updates](/api/python/docs/tutorials/packages/ndarray/sparse/row_sparse.html). +- Basic knowledge of RowSparseNDArray in MXNet. See the detailed tutorial for RowSparseNDArray in [RowSparseNDArray - NDArray for Sparse Gradient Updates](https://mxnet.apache.org/versions/master/tutorials/sparse/row_sparse.html). ## Variables @@ -155,7 +155,7 @@ f = mx.sym.sparse.elemwise_add(c, c) ### Storage Type Inference What will be the output storage types of sparse symbols? In MXNet, for any sparse symbol, the result storage types are inferred based on storage types of inputs. -You can read the [Sparse Symbol API](/api/python/docs/api/symbol/sparse/index.html) documentation to find what output storage types are. In the example below we will try out the storage types introduced in the Row Sparse and Compressed Sparse Row tutorials: `default` (dense), `csr`, and `row_sparse`. +You can read the [Sparse Symbol API](https://mxnet.apache.org/versions/master/api/python/symbol/sparse.html) documentation to find what output storage types are. In the example below we will try out the storage types introduced in the Row Sparse and Compressed Sparse Row tutorials: `default` (dense), `csr`, and `row_sparse`. ```python diff --git a/docs/static_site/src/_includes/get_started/devices/nvidia-jetson.md b/docs/static_site/src/_includes/get_started/devices/nvidia-jetson.md index 40fb1d2e82f5..fe515f3392d7 100644 --- a/docs/static_site/src/_includes/get_started/devices/nvidia-jetson.md +++ b/docs/static_site/src/_includes/get_started/devices/nvidia-jetson.md @@ -1,4 +1,4 @@ # NVIDIA Jetson Devices To install MXNet on a Jetson TX or Nano, please refer to the [Jetson installation -guide](/get_started/jetson_setup). \ No newline at end of file +guide](get_started/jetson_setup). \ No newline at end of file diff --git a/docs/static_site/src/_includes/get_started/get_started.html b/docs/static_site/src/_includes/get_started/get_started.html index 77367c7ed337..4905d28ce2d3 100644 --- a/docs/static_site/src/_includes/get_started/get_started.html +++ b/docs/static_site/src/_includes/get_started/get_started.html @@ -256,8 +256,8 @@
[The body of this hunk and of the two that followed it (@@ -354,7 +354,7 @@ and @@ -440,7 +440,7 @@) lost their HTML markup in extraction. Each carried the same kind of one-line edit inside an "Installing MXNet" block: the installation-guide links (Ubuntu and CentOS here, macOS at 354, Windows at 440) have their anchor hrefs reverted from absolute /get_started/... paths to relative get_started/... paths.]
diff --git a/docs/static_site/src/_includes/get_started/linux/java/cpu.md b/docs/static_site/src/_includes/get_started/linux/java/cpu.md
index fc6f598fa5ee..5345a2d754b2 100644
--- a/docs/static_site/src/_includes/get_started/linux/java/cpu.md
+++ b/docs/static_site/src/_includes/get_started/linux/java/cpu.md
@@ -1,6 +1,6 @@
 You can use the Maven packages defined in the following dependency to include MXNet in
 your Java project. The Java API is provided as a subset of the Scala API and is intended for inference only.
-Please refer to the <a href="/get_started/java_setup.html">MXNet-Java setup guide</a> for a detailed set of
+Please refer to the <a href="get_started/java_setup.html">MXNet-Java setup guide</a> for a detailed set of
 instructions to help you with the setup process.
diff --git a/docs/static_site/src/_includes/get_started/linux/java/gpu.md b/docs/static_site/src/_includes/get_started/linux/java/gpu.md
index 6f6757f6e2ea..5e687a353fe4 100644
--- a/docs/static_site/src/_includes/get_started/linux/java/gpu.md
+++ b/docs/static_site/src/_includes/get_started/linux/java/gpu.md
@@ -1,6 +1,6 @@
 You can use the Maven packages defined in the following dependency to include MXNet in
 your Java project. The Java API is provided as a subset of the Scala API and is intended for inference only.
-Please refer to the <a href="/get_started/java_setup.html">MXNet-Java setup guide</a> for a detailed set of
+Please refer to the <a href="get_started/java_setup.html">MXNet-Java setup guide</a> for a detailed set of
 instructions to help you with the setup process.
diff --git a/docs/static_site/src/_includes/get_started/linux/julia/build-from-source.md b/docs/static_site/src/_includes/get_started/linux/julia/build-from-source.md
index 018aca9d7387..fbbc0bd248a9 100644
--- a/docs/static_site/src/_includes/get_started/linux/julia/build-from-source.md
+++ b/docs/static_site/src/_includes/get_started/linux/julia/build-from-source.md
@@ -1,2 +1,2 @@
-Refer to the [Julia section of the MXNet Ubuntu installation guide](/get_started/ubuntu_setup#install-the-mxnet-package-for-julia).
+Refer to the [Julia section of the MXNet Ubuntu installation guide](get_started/ubuntu_setup#install-the-mxnet-package-for-julia).
diff --git a/docs/static_site/src/_includes/get_started/linux/r/cpu.md b/docs/static_site/src/_includes/get_started/linux/r/cpu.md
index 88ca5dd39933..c0a4e015b61d 100644
--- a/docs/static_site/src/_includes/get_started/linux/r/cpu.md
+++ b/docs/static_site/src/_includes/get_started/linux/r/cpu.md
@@ -1,5 +1,5 @@
 The default version of R that is installed with `apt-get` is insufficient. You will need
-to first [install R v3.4.4+ and build MXNet from source](/get_started/ubuntu_setup.html#install-the-mxnet-package-for-r).
+to first [install R v3.4.4+ and build MXNet from source](get_started/ubuntu_setup.html#install-the-mxnet-package-for-r).
 After you have setup R v3.4.4+ and MXNet, you can build and install the MXNet R bindings with the following,
 assuming that `incubator-mxnet` is the source directory you used to build MXNet as follows:
diff --git a/docs/static_site/src/_includes/get_started/linux/r/gpu.md b/docs/static_site/src/_includes/get_started/linux/r/gpu.md
index 16fbfd09d4d4..57afe7a8d65e 100644
--- a/docs/static_site/src/_includes/get_started/linux/r/gpu.md
+++ b/docs/static_site/src/_includes/get_started/linux/r/gpu.md
@@ -1,7 +1,7 @@
 The default version of R that is installed with `apt-get` is insufficient.
 You will need
 to first [install R v3.4.4+ and build MXNet from
-source](/get_started/ubuntu_setup.html#install-the-mxnet-package-for-r).
+source](get_started/ubuntu_setup.html#install-the-mxnet-package-for-r).
 After you have setup R v3.4.4+ and MXNet, you can build and install the MXNet R bindings with the
diff --git a/docs/static_site/src/_includes/get_started/macos/java/cpu.md b/docs/static_site/src/_includes/get_started/macos/java/cpu.md
index 002037a15771..2050149fd33d 100644
--- a/docs/static_site/src/_includes/get_started/macos/java/cpu.md
+++ b/docs/static_site/src/_includes/get_started/macos/java/cpu.md
@@ -1,7 +1,7 @@
 You can use the Maven packages defined in the following dependency to include MXNet in
 your Java project. The Java API is provided as a subset of the Scala API and is intended for inference only.
-Please refer to the [MXNet-Java setup guide](/get_started/java_setup.html) for a detailed set of instructions to help you with the setup process.
+Please refer to the [MXNet-Java setup guide](get_started/java_setup.html) for a detailed set of instructions to help you with the setup process.
[Several hunks were swallowed here in extraction: the remaining _includes/get_started snippets (windows julia, perl, python cpu/gpu build-from-source, windows r cpu/gpu) and the docs/static_site/src/pages hunks (architecture note_data_loading, the C++ inference tutorial, the faq pages, the R tutorials, and get_started/build_from_source.md) that the diffstat above lists; each reverts a handful of documentation links to their pre-#16581 targets. Of the pip_snippet.md and pages/get_started/index.html hunks only tag-stripped text survives ("PyPI for other MXNet pip packages, or validate your MXNet installation."; "Download from source: The signed source code for Apache MXNet (incubating) is available for download here"), identical on both sides, so those changes are confined to the stripped link targets.]
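The next two hunks revert the numpy-shape query from the tri-state int that master had introduced (Off = 0, ThreadLocalOn = 1, GlobalOn = 2) back to a plain bool, so MXIsNumpyShape again takes a bool* and reports only on or off. For orientation, a sketch of the Python-level switch this C API backs (assuming the mx.util helpers of MXNet 1.x):

```python
import mxnet as mx

# NumPy-shape semantics let NDArrays have zero-dim and zero-size shapes.
prev = mx.util.set_np_shape(True)  # thread-local on; returns the previous state
assert mx.util.is_np_shape()

with mx.util.np_shape(False):      # scoped override, restored on exit
    assert not mx.util.is_np_shape()
```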
diff --git a/include/mxnet/c_api.h b/include/mxnet/c_api.h index ac0c6726f2c7..177ec5d40146 100644 --- a/include/mxnet/c_api.h +++ b/include/mxnet/c_api.h @@ -1170,7 +1170,7 @@ MXNET_DLL int MXAutogradIsTraining(bool* curr); * \param curr returns the current status * \return 0 when success, -1 when failure happens */ -MXNET_DLL int MXIsNumpyShape(int* curr); +MXNET_DLL int MXIsNumpyShape(bool* curr); /*! * \brief set numpy compatibility switch * \param is_np_shape 1 when numpy shape semantics is thread local on, diff --git a/include/mxnet/imperative.h b/include/mxnet/imperative.h index dbd81e575872..18f6424e54f7 100644 --- a/include/mxnet/imperative.h +++ b/include/mxnet/imperative.h @@ -108,14 +108,12 @@ class Imperative { is_recording_ = is_recording; return old; } - /*! \brief return current numpy compatibility status, - * GlobalOn(2), ThreadLocalOn(1), Off(0). - * */ - int is_np_shape() const { + /*! \brief whether numpy compatibility is on. */ + bool is_np_shape() const { if (is_np_shape_global_) { - return 2; + return true; } - return is_np_shape_thread_local_ ? 1 : 0; + return is_np_shape_thread_local_; } /*! \brief specify numpy compatibility off, thread local on or global on. */ bool set_is_np_shape(int is_np_shape) { diff --git a/julia/docs/Project.toml b/julia/docs/Project.toml index 023a222beba6..a4b243b0ffea 100644 --- a/julia/docs/Project.toml +++ b/julia/docs/Project.toml @@ -4,4 +4,4 @@ DocumenterMarkdown = "997ab1e6-3595-5248-9280-8efb232c3433" MXNet = "a7949054-b901-59c6-b8e3-7238c29bf7f0" [compat] -Documenter = "~0.23" +Documenter = "~0.21" diff --git a/julia/docs/make.jl b/julia/docs/make.jl index 3ea9b07d1056..3e541c636888 100644 --- a/julia/docs/make.jl +++ b/julia/docs/make.jl @@ -19,39 +19,6 @@ using Documenter using DocumenterMarkdown using MXNet -""" -Return all files of a submodule - -julia> listpages("ndarray") -15-element Array{String,1}: - "ndarray.jl" - "ndarray/activation.jl" - "ndarray/arithmetic.jl" - "ndarray/array.jl" - ... 
- "ndarray/statistic.jl" - "ndarray/trig.jl" - "ndarray/type.jl" -""" -listpages(x) = - ["$x.jl"; joinpath.(x, readdir(joinpath(@__DIR__, "..", "src", x)))] - -const api_pages = [ - "api/context.md", - "api/ndarray.md", - "api/symbolic-node.md", - "api/model.md", - "api/initializers.md", - "api/optimizers.md", - "api/callbacks.md", - "api/metric.md", - "api/io.md", - "api/nn-factory.md", - "api/executor.md", - "api/kvstore.md", - "api/visualize.md", -] - makedocs( sitename = "MXNet.jl", modules = MXNet, diff --git a/julia/docs/mkdocs.yml b/julia/docs/mkdocs.yml index 383505621540..22cb71869673 100644 --- a/julia/docs/mkdocs.yml +++ b/julia/docs/mkdocs.yml @@ -62,5 +62,4 @@ nav: - Symbolic API: api/symbolic-node.md - Neural Networks Factory: api/nn-factory.md - Executor: api/executor.md - - Key-Value Store: api/kvstore.md - Network Visualization: api/visualize.md diff --git a/julia/docs/src/api.md b/julia/docs/src/api.md index 04cfadd6d698..60cb0831d1bf 100644 --- a/julia/docs/src/api.md +++ b/julia/docs/src/api.md @@ -18,5 +18,18 @@ # API Documentation ```@contents -Pages = api_pages +Pages = [ + "api/symbolic-node.md", + "api/ndarray.md", + "api/context.md", + "api/model.md", + "api/initializers.md", + "api/optimizers.md", + "api/callbacks.md", + "api/metric.md", + "api/io.md", + "api/nn-factory.md", + "api/executor.md", + "api/visualize.md", +] ``` diff --git a/julia/docs/src/api/ndarray.md b/julia/docs/src/api/ndarray.md index 640e8b3ec372..64f59dc5393e 100644 --- a/julia/docs/src/api/ndarray.md +++ b/julia/docs/src/api/ndarray.md @@ -19,7 +19,7 @@ ## Arithmetic Operations -In the following example `y` can be a `Real` value or another `NDArray`. +In the following example `y` can be a `Real` value or another `NDArray` | API | Example | | |-----|----------|----------------------------| @@ -70,5 +70,21 @@ In the following example `y` can be a `Real` value or another `NDArray`. 
```@autodocs Modules = [MXNet.mx] -Pages = listpages("ndarray") +Pages = [ + "ndarray.jl", + "ndarray/activation.jl", + "ndarray/arithmetic.jl", + "ndarray/array.jl", + "ndarray/autoimport.jl", + "ndarray/comparison.jl", + "ndarray/context.jl", + "ndarray/io.jl", + "ndarray/linalg.jl", + "ndarray/reduction.jl", + "ndarray/remap.jl", + "ndarray/show.jl", + "ndarray/statistic.jl", + "ndarray/trig.jl", + "ndarray/type.jl", +] ``` diff --git a/julia/docs/src/api/symbolic-node.md b/julia/docs/src/api/symbolic-node.md index 785dda87fbde..0efe4605c414 100644 --- a/julia/docs/src/api/symbolic-node.md +++ b/julia/docs/src/api/symbolic-node.md @@ -19,5 +19,14 @@ ```@autodocs Modules = [MXNet.mx] -Pages = listpages("symbolic-node") +Pages = [ + "symbolic-node.jl", + "symbolic-node/arithmetic.jl", + "symbolic-node/array.jl", + "symbolic-node/autodiff.jl", + "symbolic-node/io.jl", + "symbolic-node/op.jl", + "symbolic-node/show.jl", + "symbolic-node/type.jl", +] ``` diff --git a/julia/docs/src/index.md b/julia/docs/src/index.md index 4213265b4bd4..aacd844cc38e 100644 --- a/julia/docs/src/index.md +++ b/julia/docs/src/index.md @@ -55,6 +55,18 @@ Depth = 2 ## API Documentation ```@contents -Pages = api_pages -Depth = 2 +Pages = [ + "api/context.md", + "api/ndarray.md", + "api/symbolic-node.md", + "api/model.md", + "api/initializers.md", + "api/optimizers.md", + "api/callbacks.md", + "api/metric.md", + "api/io.md", + "api/nn-factory.md", + "api/executor.md", + "api/visualize.md", +] ``` diff --git a/julia/docs/src/tutorial/char-lstm.md b/julia/docs/src/tutorial/char-lstm.md index ab7e9352b5ab..bc7f7b471d94 100644 --- a/julia/docs/src/tutorial/char-lstm.md +++ b/julia/docs/src/tutorial/char-lstm.md @@ -31,7 +31,7 @@ networks yet, the example shown here is an implementation of LSTM by using the default FeedForward model via explicitly unfolding over time. We will be using fixed-length input sequence for training. The code is adapted from the [char-rnn example for MXNet's Python -binding](https://github.com/dmlc/mxnet-notebooks/blob/master/python/tutorials/char_lstm.ipynb), +binding](https://github.com/dmlc/mxnet/blob/master/example/rnn/char_lstm.ipynb), which demonstrates how to use low-level [Symbolic API](@ref) to build customized neural network models directly. @@ -165,7 +165,7 @@ char-lstm. To train the model, we just follow the standard high-level API. Firstly, we construct a LSTM symbolic architecture: Note all the parameters are defined in -[examples/char-lstm/config.jl](https://github.com/apache/incubator-mxnet/blob/master/julia/examples/char-lstm/config.jl). +[examples/char-lstm/config.jl](https://github.com/dmlc/MXNet.jl/blob/master/examples/char-lstm/config.jl). Now we load the text file and define the data provider. The data `input.txt` we used in this example is [a tiny Shakespeare dataset](https://github.com/dmlc/web-data/tree/master/mxnet/tinyshakespeare). @@ -318,6 +318,6 @@ illustrations](http://colah.github.io/posts/2015-08-Understanding-LSTMs/), but could otherwise be very useful for debugging. As we can see, the LSTM unfolded over time is just a (very) deep neural network. The complete code for producing this visualization can be found in -[examples/char-lstm/visualize.jl](https://github.com/apache/incubator-mxnet/blob/master/julia/examples/char-lstm/visualize.jl). +[examples/char-lstm/visualize.jl](https://github.com/apache/incubator-mxnet/tree/master/julia/examples/char-lstmvisualize.jl). 
![image](images/char-lstm-vis.svg) diff --git a/julia/docs/src/tutorial/mnist.md b/julia/docs/src/tutorial/mnist.md index a404f75efe12..cc5267071f11 100644 --- a/julia/docs/src/tutorial/mnist.md +++ b/julia/docs/src/tutorial/mnist.md @@ -23,7 +23,7 @@ multi-layer perceptron and then a convolutional neural network (the LeNet architecture) on the [MNIST handwritten digit dataset](http://yann.lecun.com/exdb/mnist/). The code for this tutorial could be found in -[examples/mnist](/api/julia/docs/api/tutorial/mnist/). There are also two Jupyter notebooks that expand a little more on the [MLP](https://github.com/ultradian/julia_notebooks/blob/master/mxnet/mnistMLP.ipynb) and the [LeNet](https://github.com/ultradian/julia_notebooks/blob/master/mxnet/mnistLenet.ipynb), using the more general `ArrayDataProvider`. +[examples/mnist](https://github.com/dmlc/MXNet.jl/tree/master/examples/mnist). There are also two Jupyter notebooks that expand a little more on the [MLP](https://github.com/ultradian/julia_notebooks/blob/master/mxnet/mnistMLP.ipynb) and the [LeNet](https://github.com/ultradian/julia_notebooks/blob/master/mxnet/mnistLenet.ipynb), using the more general `ArrayDataProvider`. Simple 3-layer MLP ------------------ diff --git a/julia/docs/src/user-guide/overview.md b/julia/docs/src/user-guide/overview.md index 342448a15bed..974cc7dee974 100644 --- a/julia/docs/src/user-guide/overview.md +++ b/julia/docs/src/user-guide/overview.md @@ -269,6 +269,8 @@ symbolic composition system. It is like [Theano](http://deeplearning.net/software/theano/), except that we avoided long expression compilation time by providing *larger* neural network related building blocks to guarantee computation performance. +See also [this note](https://mxnet.readthedocs.org/en/latest/program_model.html) +for the design and trade-off of the MXNet symbolic composition system. The basic type is `mx.SymbolicNode`. The following is a trivial example of composing two symbols with the `+` operation. diff --git a/julia/examples/char-lstm/README.md b/julia/examples/char-lstm/README.md index 155f29603623..ac745dd4cc41 100644 --- a/julia/examples/char-lstm/README.md +++ b/julia/examples/char-lstm/README.md @@ -29,7 +29,7 @@ and `StatsBase.jl`. ## Training This example is adapted from the -[example in Python binding](https://github.com/dmlc/mxnet-notebooks/blob/master/python/tutorials/char_lstm.ipynb) of +[example in Python binding](https://github.com/dmlc/mxnet/blob/master/example/rnn/char_lstm.ipynb) of MXNet. The data `input.txt` can be downloaded [here](https://github.com/dmlc/web-data/tree/master/mxnet/tinyshakespeare). Modify parameters in [config.jl](config.jl) and then run [train.jl](train.jl). 
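For comparison with the Julia code, here is a minimal Python-binding sketch of the explicit unfolding that both tutorials rely on; the sequence length and layer sizes below are illustrative, not the values from `config.jl`:

```python
import mxnet as mx

seq_len, vocab_size, num_embed, num_hidden = 8, 128, 64, 64

data = mx.sym.Variable('data')  # shape: (batch, seq_len) of character ids
embed = mx.sym.Embedding(data, input_dim=vocab_size, output_dim=num_embed)
steps = mx.sym.split(embed, num_outputs=seq_len, axis=1, squeeze_axis=True)

cell = mx.rnn.LSTMCell(num_hidden=num_hidden, prefix='lstm_')
states = cell.begin_state()
outputs = []
for t in range(seq_len):                  # explicit unfolding over time
    out, states = cell(steps[t], states)  # the same weights are reused each step
    outputs.append(out)
```

The loop is spelled out to mirror the explicit unfolding; `cell.unroll` would achieve the same in a single call.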
An example output
diff --git a/julia/src/executor.jl b/julia/src/executor.jl
index 7f6c2bb5aa58..37f2dde615b8 100644
--- a/julia/src/executor.jl
+++ b/julia/src/executor.jl
@@ -245,7 +245,7 @@ Total 11 TempSpace resource requested
 ```
 """
 Base.print(io::IO, x::Executor) = print(io, debug_str(x))
-Base.print(x::Executor) = print(stdout, x)
+Base.print(x::Executor) = print(STDOUT, x)
 
 function debug_str(x::Executor)
   s_ref = Ref{Cstring}(C_NULL)
diff --git a/julia/src/symbolic-node/show.jl b/julia/src/symbolic-node/show.jl
index 9d40ea124505..f07c6b4655ee 100644
--- a/julia/src/symbolic-node/show.jl
+++ b/julia/src/symbolic-node/show.jl
@@ -57,6 +57,6 @@ function Base.print(io::IO, sym::SymbolicNode)
   print(io, unsafe_string(out[]))
 end
 
-Base.print(sym::SymbolicNode) = print(stdout, sym)
+Base.print(sym::SymbolicNode) = print(STDOUT, sym)
 
diff --git a/perl-package/AI-MXNet/t/test_autograd.t b/perl-package/AI-MXNet/t/test_autograd.t
index 2ddad60df989..931c6d59333b 100644
--- a/perl-package/AI-MXNet/t/test_autograd.t
+++ b/perl-package/AI-MXNet/t/test_autograd.t
@@ -23,7 +23,6 @@ use AI::MXNet::TestUtils qw(same almost_equal rand_ndarray);
 use AI::MXNet::Base qw(:DEFAULT pones);
 use Test::More tests => 246;
 $ENV{MXNET_STORAGE_FALLBACK_LOG_VERBOSE} = 0;
-$ENV{MXNET_SUBGRAPH_VERBOSE} = 0;
 
 sub autograd_assert
 {
diff --git a/perl-package/AI-MXNet/t/test_gluon_trainer.t b/perl-package/AI-MXNet/t/test_gluon_trainer.t
index 3b1130af4ecf..81113af28c20 100644
--- a/perl-package/AI-MXNet/t/test_gluon_trainer.t
+++ b/perl-package/AI-MXNet/t/test_gluon_trainer.t
@@ -25,7 +25,6 @@ use AI::MXNet::TestUtils qw(almost_equal dies_ok);
 use Scalar::Util qw(refaddr);
 use AI::MXNet::Base;
 $ENV{MXNET_STORAGE_FALLBACK_LOG_VERBOSE} = 0;
-$ENV{MXNET_SUBGRAPH_VERBOSE} = 0;
 
 sub test_multi_trainer
 {
@@ -253,3 +252,4 @@ sub test_trainer_reset_kv
 }
 
 test_trainer_reset_kv();
+
diff --git a/perl-package/AI-MXNet/t/test_module.t b/perl-package/AI-MXNet/t/test_module.t
index 55e098683399..3bbd8fdc4ea4 100644
--- a/perl-package/AI-MXNet/t/test_module.t
+++ b/perl-package/AI-MXNet/t/test_module.t
@@ -22,7 +22,6 @@ use AI::MXNet qw(mx);
 use AI::MXNet::Base;
 use AI::MXNet::TestUtils qw(almost_equal enumerate same_array dies_like rand_ndarray);
 $ENV{MXNET_STORAGE_FALLBACK_LOG_VERBOSE} = 0;
-$ENV{MXNET_SUBGRAPH_VERBOSE} = 0;
 
 sub test_module_layout
 {
diff --git a/perl-package/AI-MXNet/t/test_sparse_ndarray.t b/perl-package/AI-MXNet/t/test_sparse_ndarray.t
index afb0b25aa816..f143346b4890 100644
--- a/perl-package/AI-MXNet/t/test_sparse_ndarray.t
+++ b/perl-package/AI-MXNet/t/test_sparse_ndarray.t
@@ -24,7 +24,6 @@ use AI::MXNet::TestUtils qw(zip assert enumerate same rand_shape_2d rand_shape_3
 rand_sparse_ndarray random_arrays almost_equal rand_ndarray randint allclose dies_ok);
 use AI::MXNet::Base qw(pones pzeros pdl product rand_sparse);
 $ENV{MXNET_STORAGE_FALLBACK_LOG_VERBOSE} = 0;
-$ENV{MXNET_SUBGRAPH_VERBOSE} = 0;
 
 sub sparse_nd_ones
 
diff --git a/python/mxnet/kvstore.py b/python/mxnet/kvstore.py
index 61c64ec0984f..5d332ff45ecb 100644
--- a/python/mxnet/kvstore.py
+++ b/python/mxnet/kvstore.py
@@ -31,7 +31,8 @@
 from .profiler import set_kvstore_handle
 
 def _ctype_key_value(keys, vals):
-    """Returns ctype arrays for the key-value args, and the whether string keys are used.
+    """
+    Returns ctype arrays for the key-value args, and whether string keys are used.
     For internal use only.
""" if isinstance(keys, (tuple, list)): @@ -65,7 +66,9 @@ def _ctype_key_value(keys, vals): return (c_keys, c_handle_array(vals), use_str_keys) def _ctype_dict(param_dict): - """Returns ctype arrays for keys and values(converted to strings) in a dictionary""" + """ + Returns ctype arrays for keys and values(converted to strings) in a dictionary + """ assert(isinstance(param_dict, dict)), \ "unexpected type for param_dict: " + str(type(param_dict)) c_keys = c_array(ctypes.c_char_p, [c_str(k) for k in param_dict.keys()]) diff --git a/python/mxnet/metric.py b/python/mxnet/metric.py index 6e2d66cb9d15..07ec2ef4d61d 100644 --- a/python/mxnet/metric.py +++ b/python/mxnet/metric.py @@ -153,7 +153,8 @@ def reset(self): self.global_sum_metric = 0.0 def reset_local(self): - """Resets the local portion of the internal evaluation results to initial state.""" + """Resets the local portion of the internal evaluation results + to initial state.""" self.num_inst = 0 self.sum_metric = 0.0 @@ -371,7 +372,8 @@ def reset(self): pass def reset_local(self): - """Resets the local portion of the internal evaluation results to initial state.""" + """Resets the local portion of the internal evaluation results + to initial state.""" try: for metric in self.metrics: metric.reset_local() @@ -590,7 +592,8 @@ def update(self, labels, preds): class _BinaryClassificationMetrics(object): - """Private container class for classification metric statistics. True/false positive and + """ + Private container class for classification metric statistics. True/false positive and true/false negative counts are sufficient statistics for various classification metrics. This class provides the machinery to track those statistics across mini-batches of (label, prediction) pairs. @@ -607,7 +610,9 @@ def __init__(self): self.global_true_negatives = 0 def update_binary_stats(self, label, pred): - """Update various binary classification counts for a single (label, pred) pair. + """ + Update various binary classification counts for a single (label, pred) + pair. Parameters ---------- @@ -686,7 +691,9 @@ def global_fscore(self): return 0. def matthewscc(self, use_global=False): - """Calculate the Matthew's Correlation Coefficent""" + """ + Calculate the Matthew's Correlation Coefficent + """ if use_global: if not self.global_total_examples: return 0. @@ -1597,7 +1604,8 @@ def reset(self): self.reset_local() def reset_local(self): - """Resets the local portion of the internal evaluation results to initial state.""" + """Resets the local portion of the internal evaluation results + to initial state.""" self.num_inst = 0. self.lcm = numpy.zeros((self.k, self.k)) diff --git a/python/mxnet/profiler.py b/python/mxnet/profiler.py index 8e8ac87c9e06..7dbc060ed60f 100644 --- a/python/mxnet/profiler.py +++ b/python/mxnet/profiler.py @@ -207,7 +207,8 @@ def pause(profile_process='worker'): def resume(profile_process='worker'): - """Resume paused profiling. + """ + Resume paused profiling. Parameters ---------- diff --git a/python/mxnet/rtc.py b/python/mxnet/rtc.py index 5dfc5ea6dfe2..4dea0e656b7e 100644 --- a/python/mxnet/rtc.py +++ b/python/mxnet/rtc.py @@ -172,8 +172,7 @@ def get_kernel(self, name, signature): class CudaKernel(object): """Constructs CUDA kernel. Should be created by `CudaModule.get_kernel`, - not intended to be used by users. 
- """ + not intended to be used by users.""" def __init__(self, handle, name, is_ndarray, dtypes): self.handle = handle self._name = name diff --git a/python/mxnet/runtime.py b/python/mxnet/runtime.py index f2e98fe674fa..0f7de76937c0 100644 --- a/python/mxnet/runtime.py +++ b/python/mxnet/runtime.py @@ -26,7 +26,9 @@ from .base import _LIB, check_call class Feature(ctypes.Structure): - """Compile time feature description, member fields: `name` and `enabled`.""" + """ + Compile time feature description, member fields: `name` and `enabled`. + """ _fields_ = [ ("_name", ctypes.c_char_p), ("_enabled", ctypes.c_bool) @@ -34,12 +36,16 @@ class Feature(ctypes.Structure): @property def name(self): - """Feature name.""" + """ + Feature name. + """ return self._name.decode() @property def enabled(self): - """True if MXNet was compiled with the given compile-time feature.""" + """ + True if MXNet was compiled with the given compile-time feature. + """ return self._enabled def __repr__(self): @@ -49,7 +55,8 @@ def __repr__(self): return "✖ {}".format(self.name) def feature_list(): - """Check the library for compile-time features. The list of features are maintained in libinfo.h and libinfo.cc + """ + Check the library for compile-time features. The list of features are maintained in libinfo.h and libinfo.cc Returns ------- @@ -63,7 +70,9 @@ def feature_list(): return features class Features(collections.OrderedDict): - """OrderedDict of name to Feature""" + """ + OrderedDict of name to Feature + """ instance = None def __new__(cls): if cls.instance is None: @@ -75,7 +84,8 @@ def __repr__(self): return str(list(self.values())) def is_enabled(self, feature_name): - """Check for a particular feature by name + """ + Check for a particular feature by name Parameters ---------- diff --git a/python/mxnet/test_utils.py b/python/mxnet/test_utils.py index 6c8fefca4490..4862aee8570d 100644 --- a/python/mxnet/test_utils.py +++ b/python/mxnet/test_utils.py @@ -1935,7 +1935,8 @@ def same_array(array1, array2): @contextmanager def discard_stderr(): - """Discards error output of a routine if invoked as: + """ + Discards error output of a routine if invoked as: with discard_stderr(): ... @@ -2323,8 +2324,7 @@ def __exit__(self, ptype, value, trace): def collapse_sum_like(a, shape): """Given `a` as a numpy ndarray, perform reduce_sum on `a` over the axes that do not - exist in `shape`. Note that an ndarray with `shape` must be broadcastable to `a`. - """ + exist in `shape`. Note that an ndarray with `shape` must be broadcastable to `a`.""" assert len(a.shape) >= len(shape) if np.prod(shape) == 0 or a.size == 0: return np.zeros(shape, dtype=a.dtype) @@ -2349,8 +2349,7 @@ def is_cd_run(): def has_tvm_ops(): """Returns True if MXNet is compiled with TVM generated operators. If current ctx - is GPU, it only returns True for CUDA compute capability > 52 where FP16 is supported. - """ + is GPU, it only returns True for CUDA compute capability > 52 where FP16 is supported.""" built_with_tvm_op = _features.is_enabled("TVM_OP") ctx = current_context() if ctx.device_type == 'gpu': @@ -2368,8 +2367,7 @@ def has_tvm_ops(): def is_op_runnable(): """Returns True for all CPU tests. Returns True for GPU tests that are either of the following. 1. Built with USE_TVM_OP=0. - 2. Built with USE_TVM_OP=1, but with compute capability >= 53. - """ + 2. 
Built with USE_TVM_OP=1, but with compute capability >= 53.""" ctx = current_context() if ctx.device_type == 'gpu': if not _features.is_enabled("TVM_OP"): diff --git a/python/mxnet/util.py b/python/mxnet/util.py index 9e15caae9698..cef034fd0caa 100644 --- a/python/mxnet/util.py +++ b/python/mxnet/util.py @@ -60,7 +60,8 @@ def get_gpu_memory(gpu_dev_id): def set_np_shape(active): - """Turns on/off NumPy shape semantics, in which `()` represents the shape of scalar tensors, + """ + Turns on/off NumPy shape semantics, in which `()` represents the shape of scalar tensors, and tuples with `0` elements, for example, `(0,)`, `(1, 0, 2)`, represent the shapes of zero-size tensors. This is turned off by default for keeping backward compatibility. @@ -567,7 +568,8 @@ def hybrid_forward(self, F, x, w): def np_ufunc_legal_option(key, value): - """Checking if ufunc arguments are legal inputs + """ + Checking if ufunc arguments are legal inputs Parameters ---------- diff --git a/scala-package/native/src/main/native/org_apache_mxnet_native_c_api.cc b/scala-package/native/src/main/native/org_apache_mxnet_native_c_api.cc index 26eea3dd062b..5c704c9646a2 100644 --- a/scala-package/native/src/main/native/org_apache_mxnet_native_c_api.cc +++ b/scala-package/native/src/main/native/org_apache_mxnet_native_c_api.cc @@ -2777,9 +2777,9 @@ JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxDumpProfile // Numpy JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxIsNumpyShape (JNIEnv *env, jobject obj, jobject compatibleRef) { - int isNumpyShape; + bool isNumpyShape; int ret = MXIsNumpyShape(&isNumpyShape); - SetIntField(env, compatibleRef, isNumpyShape); + SetIntField(env, compatibleRef, static_cast(isNumpyShape)); return ret; } diff --git a/src/c_api/c_api_ndarray.cc b/src/c_api/c_api_ndarray.cc index de208c0fed99..b80e17c18071 100644 --- a/src/c_api/c_api_ndarray.cc +++ b/src/c_api/c_api_ndarray.cc @@ -276,7 +276,7 @@ int MXAutogradSetIsRecording(int is_recording, int* prev) { API_END(); } -int MXIsNumpyShape(int* curr) { +int MXIsNumpyShape(bool* curr) { API_BEGIN(); *curr = Imperative::Get()->is_np_shape(); API_END(); diff --git a/src/executor/graph_executor.cc b/src/executor/graph_executor.cc index 882105da1321..d92253266f35 100644 --- a/src/executor/graph_executor.cc +++ b/src/executor/graph_executor.cc @@ -1627,16 +1627,16 @@ static nnvm::Graph InferForwardAttrs(nnvm::Graph g, static bool SubgraphBackendCheck(const op::SubgraphBackendPtr& backend, const Context& default_ctx, - int verbose = 1) { + bool verbose = false) { if (backend->HasAttr("enable") && (backend->GetAttr("enable") != true)) { - if (verbose > 1) { + if (verbose) { LOG(INFO) << "Subgraph backend " << backend->GetName() << " isn't activated."; } return false; } if (backend->HasAttr("context") && backend->GetAttr("context") != default_ctx) { - if (verbose > 1) { + if (verbose) { LOG(INFO) << "Subgraph backend " << backend->GetName() << " isn't activated as context mismatch."; } @@ -1647,7 +1647,7 @@ static bool SubgraphBackendCheck(const op::SubgraphBackendPtr& backend, static bool SubgraphPropertyCheck(const std::string& backend_name, const op::SubgraphPropertyPtr& prop, bool need_grad, - int verbose = 1) { + bool verbose = false) { auto full_name = prop->HasAttr("property_name") ? 
prop->GetAttr("property_name") : std::string(); if (prop->HasAttr("disable") && prop->GetAttr("disable") == true) { @@ -1657,7 +1657,7 @@ static bool SubgraphPropertyCheck(const std::string& backend_name, } if (prop->HasAttr("inference_only") && prop->GetAttr("inference_only") == true) { if (need_grad) { - if (verbose > 1) { + if (verbose) { LOG(INFO) << "skip partitioning graph with subgraph property " << full_name << " from backend " << backend_name << " as it requires `grad_req=null`."; } @@ -1699,7 +1699,7 @@ static nnvm::Symbol BuildSubgraph( const std::unordered_map& arg_stype_map, const Context& default_ctx, const std::map& ctx_map, std::vector* in_arg_ctxes, std::vector* arg_grad_ctxes, std::vector* grad_req_types, - std::vector* aux_state_ctxes, int verbose = 1) { + std::vector* aux_state_ctxes, bool verbose = false) { // setup map for in_arg_ctxes, arg_grad_ctxes, aux_state_ctxes and grad_req_types std::unordered_map in_arg_ctx_map; std::unordered_map arg_grad_ctx_map; @@ -1794,7 +1794,7 @@ static nnvm::Symbol BuildSubgraph(const nnvm::Symbol& src, const op::SubgraphBac std::vector* in_args, std::vector* arg_grad_store, std::vector* grad_req_type, - std::vector* aux_states, int verbose = 1) { + std::vector* aux_states, bool verbose = false) { // setup map for in_args, arg_grad_store, grad_req_type and aux_states std::unordered_map in_args_map; std::unordered_map arg_grad_store_map; @@ -1929,11 +1929,11 @@ Executor *Executor::SimpleBind(nnvm::Symbol symbol, auto exec = new exec::GraphExecutor(); bool init = false; if (!exec->subgraph_property().empty()) { - static int verbose = dmlc::GetEnv("MXNET_SUBGRAPH_VERBOSE", 1); + static bool verbose = dmlc::GetEnv("MXNET_SUBGRAPH_VERBOSE", false); const auto& backend_name = exec->subgraph_property(); const auto& backend = op::SubgraphBackendRegistry::Get()->GetSubgraphBackend(backend_name); if (exec::SubgraphBackendCheck(backend, default_ctx, verbose)) { - if (verbose) LOG(INFO) << "Subgraph backend " << backend_name << " is activated."; + LOG(INFO) << "Subgraph backend " << backend_name << " is activated."; std::vector tmp_in_arg_ctxes = in_arg_ctxes; std::vector tmp_arg_grad_ctxes = arg_grad_ctxes; std::vector tmp_aux_state_ctxes = aux_state_ctxes; @@ -2001,7 +2001,7 @@ Executor *Executor::Bind(nnvm::Symbol symbol, const std::vector &aux_states, Executor* shared_exec) { auto exec = new exec::GraphExecutor(); - static int verbose = dmlc::GetEnv("MXNET_SUBGRAPH_VERBOSE", 1); + static bool verbose = dmlc::GetEnv("MXNET_SUBGRAPH_VERBOSE", false); std::vector tmp_in_args = in_args; std::vector tmp_arg_grad_store = arg_grad_store; std::vector tmp_grad_req_type = grad_req_type; @@ -2011,7 +2011,7 @@ Executor *Executor::Bind(nnvm::Symbol symbol, const auto& backend_name = exec->subgraph_property(); const auto& backend = op::SubgraphBackendRegistry::Get()->GetSubgraphBackend(backend_name); if (exec::SubgraphBackendCheck(backend, default_ctx, verbose)) { - if (verbose) LOG(INFO) << "Subgraph backend " << backend_name << " is activated."; + LOG(INFO) << "Subgraph backend " << backend_name << " is activated."; symbol = exec::BuildSubgraph(symbol, backend, default_ctx, group2ctx, &tmp_in_args, &tmp_arg_grad_store, &tmp_grad_req_type, &tmp_aux_states, verbose); diff --git a/src/imperative/cached_op.cc b/src/imperative/cached_op.cc index 39c2880d627b..6818d757ab79 100644 --- a/src/imperative/cached_op.cc +++ b/src/imperative/cached_op.cc @@ -32,22 +32,6 @@ DMLC_REGISTER_PARAMETER(CachedOpConfig); constexpr uint32_t kEidNotExist = 
std::numeric_limits::max(); -const char CachedOp::FULL[] = "full"; -const char CachedOp::FORWARD[] = "forward"; -const char CachedOp::BACKWARD[] = "backward"; -const char CachedOp::REF_COUNT[] = "ref_count"; -const char CachedOp::MEM_PLAN[] = "mem_plan"; -const char CachedOp::STORAGE_PLAN[] = "storage_plan"; - -namespace { - -std::string AddPrefix(const std::string& prefix, - const std::string& s) { - return prefix + "_" + s; -} - -} // namespace - struct CachedOp::GraphInfo { nnvm::Graph fwd_graph; nnvm::Graph full_graph; @@ -152,7 +136,7 @@ CachedOp::CachedOp( for (const auto& j : idx[i].inputs) ++ref_count[idx.entry_id(j)]; } - fwd_graph_.attrs[AddPrefix(FORWARD, REF_COUNT)] = + fwd_graph_.attrs["forward_ref_count"] = std::make_shared(std::move(ref_count)); inlining_ = !config_.static_alloc && @@ -217,9 +201,9 @@ CachedOp::CachedOp( } } - auto full_ref_count = fwd_graph_.GetAttr >(AddPrefix(FORWARD, REF_COUNT)); + auto full_ref_count = fwd_graph_.GetAttr >("forward_ref_count"); for (size_t i = 0; i < num_forward_entries; ++i) full_ref_count.at(i) += ref_count[i]; - fwd_graph_.attrs[AddPrefix(FULL, REF_COUNT)] = + fwd_graph_.attrs["full_ref_count"] = std::make_shared(std::move(full_ref_count)); size_t num_forward_inputs = num_inputs(); @@ -352,15 +336,14 @@ bool CachedOp::SetForwardGraph( // When dynmaic shape exists, it is not feasible to plan memory ahead of time if (contain_dynamic_shape) { - g.attrs.erase(AddPrefix(FORWARD, MEM_PLAN)); - g.attrs.erase(AddPrefix(FULL, MEM_PLAN)); + g.attrs.erase("forward_mem_plan"); + g.attrs.erase("full_mem_plan"); return false; } - const std::string& prefix = recording ? FULL : FORWARD; if (!match) { - g.attrs.erase(AddPrefix(FORWARD, MEM_PLAN)); - g.attrs.erase(AddPrefix(FULL, MEM_PLAN)); - } else if (g.attrs.count(AddPrefix(prefix, MEM_PLAN))) { + g.attrs.erase("forward_mem_plan"); + g.attrs.erase("full_mem_plan"); + } else if (g.attrs.count(recording ? "full_mem_plan" : "forward_mem_plan")) { return true; } @@ -380,9 +363,9 @@ bool CachedOp::SetForwardGraph( } auto mem_plan = PlanMemory( - &g, std::move(storage), g.GetAttr >(AddPrefix(prefix, REF_COUNT)), - AddPrefix(prefix, STORAGE_PLAN)); - g.attrs[AddPrefix(prefix, MEM_PLAN)] = + &g, std::move(storage), g.GetAttr >( + recording ? "full_ref_count" : "forward_ref_count")); + g.attrs[recording ? 
"full_mem_plan" : "forward_mem_plan"] = std::make_shared(std::move(mem_plan)); return false; @@ -449,7 +432,7 @@ bool CachedOp::SetBackwardGraph( size_t num_forward_nodes = fwd_graph_.indexed_graph().num_nodes(); size_t num_forward_entries = fwd_graph_.indexed_graph().num_node_entries(); - if (!g.attrs.count(AddPrefix(BACKWARD, REF_COUNT))) { + if (!g.attrs.count("backward_ref_count")) { std::vector ref_count(idx.num_node_entries(), 0); for (size_t i = num_forward_nodes; i < idx.num_nodes(); ++i) { for (const auto& j : idx[i].inputs) ++ref_count[idx.entry_id(j)]; @@ -460,7 +443,7 @@ bool CachedOp::SetBackwardGraph( } } for (const auto& i : idx.outputs()) ++ref_count[idx.entry_id(i)]; - g.attrs[AddPrefix(BACKWARD, REF_COUNT)] = std::make_shared(std::move(ref_count)); + g.attrs["backward_ref_count"] = std::make_shared(std::move(ref_count)); } auto shapes = info->fwd_graph.GetAttr("shape"); @@ -493,8 +476,8 @@ bool CachedOp::SetBackwardGraph( false, node_range, entry_range); if (!match) { - g.attrs.erase(AddPrefix(BACKWARD, MEM_PLAN)); - } else if (g.attrs.count(AddPrefix(BACKWARD, MEM_PLAN))) { + g.attrs.erase("backward_mem_plan"); + } else if (g.attrs.count("backward_mem_plan")) { return true; } @@ -508,13 +491,11 @@ bool CachedOp::SetBackwardGraph( for (const auto i : idx.outputs()) storage[idx.entry_id(i)] = exec::kExternalStorageID; auto mem_plan = PlanMemory( - &g, std::move(storage), - g.GetAttr >(AddPrefix(BACKWARD, REF_COUNT)), - AddPrefix(BACKWARD, STORAGE_PLAN), + &g, std::move(storage), g.GetAttr >("backward_ref_count"), {num_forward_nodes, idx.num_nodes()}, {num_forward_entries, idx.num_node_entries()}, detect_inplace_addto); - g.attrs[AddPrefix(BACKWARD, MEM_PLAN)] = std::make_shared(std::move(mem_plan)); + g.attrs["backward_mem_plan"] = std::make_shared(std::move(mem_plan)); return false; } @@ -545,10 +526,9 @@ void CachedOp::StaticAllocMemory( const auto& default_ctx = state.context; nnvm::Graph& g = keep_fwd ? state.info.full_graph : state.info.fwd_graph; const auto& idx = g.indexed_graph(); - const std::string& graph_type = keep_fwd ? BACKWARD : (recording ? FULL : FORWARD); - const auto& storage_plan_attr = AddPrefix(graph_type, STORAGE_PLAN); - const auto& storage_plan = g.GetAttr >(storage_plan_attr); - const auto& mem_plan = g.GetAttr(AddPrefix(graph_type, MEM_PLAN)); + const auto& vstorage_inplace = g.GetAttr >("storage_inplace_index"); + const auto& mem_plan = g.GetAttr( + keep_fwd ? "backward_mem_plan" : (recording ? "full_mem_plan" : "forward_mem_plan")); std::vector addto_entry; if (g.attrs.count("addto_entry")) { addto_entry = g.GetAttr >("addto_entry"); @@ -578,9 +558,9 @@ void CachedOp::StaticAllocMemory( for (size_t i = start_eid; i < end_eid; ++i) { if (addto_entry.size() && addto_entry[i]) { state.array_reqs[i] = kAddTo; - } else if (storage_plan[i] >= 0) { + } else if (vstorage_inplace[i] >= 0) { state.array_reqs[i] = kWriteInplace; - } else if (storage_plan[i] == -2) { + } else if (vstorage_inplace[i] == -2) { // -2 indicate that the entry is never referenced. state.array_reqs[i] = kNullOp; } else { @@ -882,9 +862,8 @@ OpStatePtr CachedOp::DynamicForward( } // Allocate NDArrays - const std::string& graph_type = recording ? FULL : FORWARD; - std::vector ref_count = - g.GetAttr >(AddPrefix(graph_type, REF_COUNT)); + std::vector ref_count = g.GetAttr >( + recording ? 
"full_ref_count" : "forward_ref_count"); std::vector array_reqs(arrays.size(), kWriteTo); for (size_t i = 0; i < idx.num_node_entries(); ++i) { @@ -892,7 +871,8 @@ OpStatePtr CachedOp::DynamicForward( } const auto& dispatch_modes = g.GetAttr("dispatch_mode"); if (!use_naive_run) { - const auto& mem_plan = g.GetAttr(AddPrefix(graph_type, MEM_PLAN)); + const auto& mem_plan = g.GetAttr( + recording ? "full_mem_plan" : "forward_mem_plan"); AllocateMemory(g, idx, default_ctx, 0, idx.num_node_entries(), mem_plan, arrays, &array_reqs); const auto& dtypes = g.GetAttr("dtype"); @@ -1031,7 +1011,7 @@ void CachedOp::DynamicBackward( } // Allocate NDArrays - auto ref_count = g.GetAttr >(AddPrefix(BACKWARD, REF_COUNT)); + auto ref_count = g.GetAttr >("backward_ref_count"); if (retain_graph) { for (size_t i = 0; i < num_forward_entries; ++i) ++ref_count[i]; } @@ -1047,7 +1027,7 @@ void CachedOp::DynamicBackward( if (ref_count[i] == 0) array_reqs[i] = kNullOp; } - const auto& mem_plan = g.GetAttr(AddPrefix(BACKWARD, MEM_PLAN)); + const auto& mem_plan = g.GetAttr("backward_mem_plan"); AllocateMemory(g, idx, default_ctx, num_forward_entries, idx.num_node_entries(), mem_plan, arrays, &array_reqs); diff --git a/src/imperative/cached_op.h b/src/imperative/cached_op.h index 84f96300c27b..db049d59ed80 100644 --- a/src/imperative/cached_op.h +++ b/src/imperative/cached_op.h @@ -140,13 +140,6 @@ class CachedOp { void RegisterOpHook(const CachedOp::CachedOpMonCallback& callback, bool monitor_all = false); - static const char FULL[]; - static const char FORWARD[]; - static const char BACKWARD[]; - static const char REF_COUNT[]; - static const char MEM_PLAN[]; - static const char STORAGE_PLAN[]; - private: struct GraphInfo; struct DynamicRuntime; diff --git a/src/imperative/imperative_utils.h b/src/imperative/imperative_utils.h index 64034dafb4d5..356b85e67ee2 100644 --- a/src/imperative/imperative_utils.h +++ b/src/imperative/imperative_utils.h @@ -834,7 +834,6 @@ inline MemoryPlanVector PlanMemory( nnvm::Graph* p_g, nnvm::StorageVector&& storage, const std::vector& ref_count, - const std::string& storage_plan, const std::pair& node_range = {0, 0}, const std::pair& entry_range = {0, 0}, bool detect_inplace_addto = false) { @@ -852,7 +851,6 @@ inline MemoryPlanVector PlanMemory( const auto& dtypes = g.GetAttr("dtype"); const auto& shapes = g.GetAttr("shape"); const auto& storage_inplace = g.GetAttr >("storage_inplace_index"); - g.attrs[storage_plan] = std::make_shared(storage_inplace); const auto& storage_ids = g.GetAttr("storage_id"); uint32_t entry_start = entry_range.first; uint32_t entry_end = diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc index 3feccf55b734..e1374ecdb9dd 100644 --- a/src/ndarray/ndarray.cc +++ b/src/ndarray/ndarray.cc @@ -1616,13 +1616,12 @@ void NDArray::Save(dmlc::Stream *strm) const { nd_cpu.WaitToRead(); save_data = nd_cpu.data(); } else { -#if MXNET_USE_MKLDNN == 1 - // For mkldnn, a copy of *this can ensure no write access pending on *this. - nd_cpu = this->Copy(Context::CPU()); - nd_cpu.WaitToRead(); -#else this->WaitToRead(); nd_cpu = *this; +#if MXNET_USE_MKLDNN == 1 + if (nd_cpu.IsMKLDNNData()) { + nd_cpu = nd_cpu.Reorder2Default(); + } #endif save_data = nd_cpu.data(); } @@ -1715,8 +1714,7 @@ bool NDArray::Load(dmlc::Stream *strm) { " Please turn on np shape semantics in Python using `with np_shape(True)`" " or decorator `use_np_shape` to scope the code of loading the ndarray."; } else { - // when the flag is global on, skip the check since it would be always global on. 
- CHECK(Imperative::Get()->is_np_shape() == GlobalOn || !Imperative::Get()->is_np_shape()) + CHECK(!Imperative::Get()->is_np_shape()) << "ndarray was not saved in np shape semantics, but being loaded in np shape semantics." " Please turn off np shape semantics in Python using `with np_shape(False)`" " to scope the code of loading the ndarray."; @@ -2007,18 +2005,16 @@ void NDArray::SyncCopyToCPU(void *data, size_t size) const { TBlob dst(data, dshape, cpu::kDevMask, this->dtype_, 0); // NOLINT(*) if (this->ctx().dev_mask() == cpu::kDevMask) { - Engine::Get()->PushAsync( - [&](RunContext rctx, Engine::CallbackOnComplete on_complete) { - RunContext ctx{this->ctx(), nullptr, nullptr, false}; - NDArray src = *this; + this->WaitToRead(); + RunContext rctx{this->ctx(), nullptr, nullptr, false}; + NDArray src = *this; #if MXNET_USE_MKLDNN == 1 - src = this->Reorder2Default(); + if (src.IsMKLDNNData()) { + src = this->Reorder2Default(); + } #endif - ndarray::Copy(src.data(), &dst, Context::CPU(), Context::CPU(), ctx); - on_complete(); - }, - this->ctx(), {this->var()}, {}, FnProperty::kNormal, 0, "SyncCopyCPU2CPU"); - this->WaitToWrite(); + ndarray::Copy(src.data(), &dst, + Context::CPU(), Context::CPU(), rctx); } else { #if MXNET_USE_CUDA Engine::Get()->PushAsync( diff --git a/src/operator/quantization/dequantize.cc b/src/operator/quantization/dequantize.cc index 9ce135040fb4..e8e2cd90b86c 100644 --- a/src/operator/quantization/dequantize.cc +++ b/src/operator/quantization/dequantize.cc @@ -43,6 +43,8 @@ bool DequantizeStorageType(const nnvm::NodeAttrs& attrs, } #endif (*out_attrs)[0] = kDefaultStorage; + (*out_attrs)[1] = kDefaultStorage; + (*out_attrs)[2] = kDefaultStorage; return true; } diff --git a/src/operator/subgraph/build_subgraph.cc b/src/operator/subgraph/build_subgraph.cc index 0f4c570331a2..d43647ac83b9 100644 --- a/src/operator/subgraph/build_subgraph.cc +++ b/src/operator/subgraph/build_subgraph.cc @@ -318,8 +318,8 @@ void PreSelectSubgraphNodes(const nnvm::Graph& g, SubgraphSelectorV2Ptr subgraph for (auto node : excluded_nodes) { excluded_node_names += node->node->attrs.name + ", "; } - static int verbose = dmlc::GetEnv("MXNET_SUBGRAPH_VERBOSE", 1); - if (verbose > 1) { + static bool verbose = dmlc::GetEnv("MXNET_SUBGRAPH_VERBOSE", false); + if (verbose) { LOG(INFO) << "Found a cycle when BFS from node " << simple_nodes[snid]->node->attrs.name << ". Excluding nodes " << excluded_node_names << "and retrying"; } @@ -706,9 +706,9 @@ void TopSortEntries(const nnvm::Graph& g, } nnvm::Graph BuildSubgraph(nnvm::Graph&& g) { - static int verbose = dmlc::GetEnv("MXNET_SUBGRAPH_VERBOSE", 1); + static bool verbose = dmlc::GetEnv("MXNET_SUBGRAPH_VERBOSE", false); if (!g.HasAttr("subgraph_property")) { // treat the whole graph as a subgraph - if (verbose > 1) { + if (verbose) { LOG(INFO) << "The graph has no attribute of subgraph_property attached. 
" "The original graph is returned."; } diff --git a/src/operator/tensor/dot-inl.h b/src/operator/tensor/dot-inl.h index 8405404dc627..96c869f40d40 100644 --- a/src/operator/tensor/dot-inl.h +++ b/src/operator/tensor/dot-inl.h @@ -30,7 +30,6 @@ #include #include #include - #include "./util/tensor_util-inl.h" #include "../mshadow_op.h" #include "../elemwise_op_common.h" @@ -1354,7 +1353,6 @@ void BatchDotForward_(const nnvm::NodeAttrs& attrs, using namespace mshadow; using namespace mshadow::expr; mshadow::Stream *s = ctx.get_stream(); - if (req[0] == kNullOp) return; const DotParam& param = nnvm::get(attrs.parsed); CHECK_EQ(outputs[0].type_flag_, inputs[0].type_flag_) << "Binary function only support input/output with the same type"; @@ -1364,46 +1362,115 @@ void BatchDotForward_(const nnvm::NodeAttrs& attrs, (outputs[0].type_flag_ == kFloat16 && ctx.run_ctx.ctx.dev_mask() == mshadow::gpu::kDevMask)) << "dot only supports float32/float64 for CPU, and float16/float32/float64 for GPU"; MSHADOW_REAL_TYPE_SWITCH(outputs[0].type_flag_, DType, { - int ndim = outputs[0].ndim(); - if (outputs[0].shape_.Size() == 0 || inputs[0].shape_.Size() == 0 - || inputs[1].shape_.Size() == 0) { - if (outputs[0].shape_.Size() != 0 && req[0] != kAddTo) { - mxnet_op::Kernel::Launch(s, outputs[0].shape_.Size(), - outputs[0].dptr()); + mshadow::Tensor out = outputs[0].get(s); + mshadow::Tensor mlhs = inputs[0].get(s); + mshadow::Tensor mrhs = inputs[1].get(s); + mshadow::Tensor workspace = + ctx.requested[0].get_space_typed(mshadow::Shape1(3 * out.size(0)), s); + if (kNullOp != req[0]) { + if (param.transpose_a && param.transpose_b) { + mshadow::BatchGEMM(out, mlhs, mrhs, (DType)1.0f, + (kAddTo == req[0]) ? (DType)1.0f : (DType)0.0f, + workspace); + } else if (!param.transpose_a && param.transpose_b) { + mshadow::BatchGEMM(out, mlhs, mrhs, (DType)1.0f, + (kAddTo == req[0]) ? (DType)1.0f : (DType)0.0f, + workspace); + } else if (param.transpose_a && !param.transpose_b) { + mshadow::BatchGEMM(out, mlhs, mrhs, (DType)1.0f, + (kAddTo == req[0]) ? (DType)1.0f : (DType)0.0f, + workspace); + } else { + mshadow::BatchGEMM(out, mlhs, mrhs, (DType)1.0f, + (kAddTo == req[0]) ? 
(DType)1.0f : (DType)0.0f, + workspace); } - return; } - size_t batch_size = outputs[0].shape_.ProdShape(0, ndim - 2); - mshadow::Tensor out = - outputs[0].get_with_shape(Shape3(batch_size, - outputs[0].shape_[ndim - 2], - outputs[0].shape_[ndim - 1]), s); - mshadow::Tensor mlhs = - inputs[0].get_with_shape(Shape3(batch_size, - inputs[0].shape_[ndim - 2], - inputs[0].shape_[ndim - 1]), s); - mshadow::Tensor mrhs = - inputs[1].get_with_shape(Shape3(batch_size, - inputs[1].shape_[ndim - 2], - inputs[1].shape_[ndim - 1]), s); - mshadow::Tensor workspace = - ctx.requested[0].get_space_typed(mshadow::Shape1(3 * out.size(0)), s); + }); +} + +template +void BatchDotBackward_(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + using namespace mshadow; + using namespace mshadow::expr; + mshadow::Stream *s = ctx.get_stream(); + const DotParam& param = nnvm::get(attrs.parsed); + CHECK_NE(req[1], kWriteInplace); + CHECK_NE(req[0], kWriteInplace); + CHECK(outputs[0].type_flag_ == kFloat32 || outputs[0].type_flag_ == kFloat64 || + (outputs[0].type_flag_ == kFloat16 && ctx.run_ctx.ctx.dev_mask() == mshadow::gpu::kDevMask)) + << "dot only supports float32/float64 for CPU, and float16/float32/float64 for GPU"; + MSHADOW_REAL_TYPE_SWITCH(outputs[0].type_flag_, DType, { + mshadow::Tensor mout_grad = inputs[0].get(s); + mshadow::Tensor mlhs_data = inputs[1].get(s); + mshadow::Tensor mrhs_data = inputs[2].get(s); + mshadow::Tensor mlhs_grad = outputs[0].get(s); + mshadow::Tensor mrhs_grad = outputs[1].get(s); + mshadow::Tensor workspace = + ctx.requested[0].get_space_typed( + mshadow::Shape2(2, 3 * mout_grad.size(0)), s); + mshadow::Tensor rhs_workspace = workspace[0]; + mshadow::Tensor lhs_workspace = workspace[1]; if (param.transpose_a && param.transpose_b) { - mshadow::BatchGEMM(out, mlhs, mrhs, (DType)1.0f, - (kAddTo == req[0]) ? (DType)1.0f : (DType)0.0f, - workspace); + // Gradient of z = dot(x.T, y.T) + // dy = dot(x, dz).T = dot(dz.T, x.T) + // dx = dot(dz, y).T = dot(y.T, dz.T) + if (kNullOp != req[1]) { + mshadow::BatchGEMM(mrhs_grad, mout_grad, mlhs_data, (DType)1.0f, + (kAddTo == req[1]) ? (DType)1.0f : (DType)0.0f, + rhs_workspace); + } + if (kNullOp != req[0]) { + mshadow::BatchGEMM(mlhs_grad, mrhs_data, mout_grad, (DType)1.0f, + (kAddTo == req[0]) ? (DType)1.0f : (DType)0.0f, + lhs_workspace); + } } else if (!param.transpose_a && param.transpose_b) { - mshadow::BatchGEMM(out, mlhs, mrhs, (DType)1.0f, - (kAddTo == req[0]) ? (DType)1.0f : (DType)0.0f, - workspace); + // Gradient of z = dot(x, y.T) + // dy = dot(x.T, dz).T = dot(dz.T, x) + // dx = dot(dz, y) + if (kNullOp != req[1]) { + mshadow::BatchGEMM(mrhs_grad, mout_grad, mlhs_data, (DType)1.0f, + (kAddTo == req[1]) ? (DType)1.0f : (DType)0.0f, + rhs_workspace); + } + if (kNullOp != req[0]) { + mshadow::BatchGEMM(mlhs_grad, mout_grad, mrhs_data, (DType)1.0f, + (kAddTo == req[0]) ? (DType)1.0f : (DType)0.0f, + lhs_workspace); + } } else if (param.transpose_a && !param.transpose_b) { - mshadow::BatchGEMM(out, mlhs, mrhs, (DType)1.0f, - (kAddTo == req[0]) ? (DType)1.0f : (DType)0.0f, - workspace); + // Gradient of z = dot(x.T, y) + // dy = dot(x, dz) + // dx = dot(dz, y.T).T = dot(y, dz.T) + if (kNullOp != req[1]) { + mshadow::BatchGEMM(mrhs_grad, mlhs_data, mout_grad, (DType)1.0f, + (kAddTo == req[1]) ? 
(DType)1.0f : (DType)0.0f, + rhs_workspace); + } + if (kNullOp != req[0]) { + mshadow::BatchGEMM(mlhs_grad, mrhs_data, mout_grad, (DType)1.0f, + (kAddTo == req[0]) ? (DType)1.0f : (DType)0.0f, + lhs_workspace); + } } else { - mshadow::BatchGEMM(out, mlhs, mrhs, (DType)1.0f, - (kAddTo == req[0]) ? (DType)1.0f : (DType)0.0f, - workspace); + // Gradient of z = dot(x, y) + // dy = dot(x.T, dz) + // dx = dot(dz, y.T) + if (kNullOp != req[1]) { + mshadow::BatchGEMM(mrhs_grad, mlhs_data, mout_grad, (DType)1.0f, + (kAddTo == req[1]) ? (DType)1.0f : (DType)0.0f, + rhs_workspace); + } + if (kNullOp != req[0]) { + mshadow::BatchGEMM(mlhs_grad, mout_grad, mrhs_data, (DType)1.0f, + (kAddTo == req[0]) ? (DType)1.0f : (DType)0.0f, + lhs_workspace); + } } }); } @@ -1418,34 +1485,24 @@ inline bool BatchDotShape(const nnvm::NodeAttrs& attrs, mxnet::TShape& rshape = (*in_attrs)[1]; // return false if lhs and rhs both have fully unknown shape if (!ndim_is_known(lshape) || !ndim_is_known(rshape)) return false; - if (lshape.ndim() >= 3 && rshape.ndim() >= 3 && lshape.ndim() == rshape.ndim()) { - int ndim = lshape.ndim(); + if (lshape.ndim() == 3 && rshape.ndim() == 3) { // only partially infer shape if last dim of lhs and second dim of rhs is known - bool last_dim_known = dim_size_is_known(lshape, ndim - 1); - bool second_dim_known = dim_size_is_known(rshape, ndim - 2); + bool last_dim_known = dim_size_is_known(lshape, 2); + bool second_dim_known = dim_size_is_known(rshape, 1); if ( !last_dim_known || !second_dim_known) return false; - for (int i = 0; i < ndim - 2; i++) { - CHECK_EQ(lshape[i], rshape[i]) - << "batch_dot shape error (the leading batch dimensions must be equal): " - << lshape << " X " << rshape - << " trans_a=" << param.transpose_a << " trans_b=" << param.transpose_b; - } - dim_t out_m = param.transpose_a ? lshape[ndim - 1] : lshape[ndim - 2]; - dim_t lshape_k = param.transpose_a ? lshape[ndim - 2] : lshape[ndim - 1]; - dim_t out_n = param.transpose_b ? rshape[ndim - 2] : rshape[ndim - 1]; - dim_t rshape_k = param.transpose_b ? rshape[ndim - 1] : rshape[ndim - 2]; - CHECK_EQ(lshape_k, rshape_k) - << "batch_dot shape error (shape mismatch): " << lshape << " X " << rshape + CHECK(lshape[0] == rshape[0]) + << "batch_dot shape error(batch_size must be equal): " << lshape << " X " << rshape << " trans_a=" << param.transpose_a << " trans_b=" << param.transpose_b; - std::vector out_shape_vec; - for (int i = 0; i < ndim - 2; i++) { - out_shape_vec.push_back(lshape[i]); - } - out_shape_vec.push_back(out_m); - out_shape_vec.push_back(out_n); - SHAPE_ASSIGN_CHECK(*out_attrs, 0, mxnet::TShape(out_shape_vec)); + index_t out_m = param.transpose_a ? lshape[2] : lshape[1]; + index_t lshape_k = param.transpose_a ? lshape[1] : lshape[2]; + index_t out_n = param.transpose_b ? rshape[1] : rshape[2]; + index_t rshape_k = param.transpose_b ? rshape[2] : rshape[1]; + CHECK(lshape_k == rshape_k) + << "batch_dot shape error(shape mismatch): " << lshape << " X " << rshape + << " trans_a=" << param.transpose_a << " trans_b=" << param.transpose_b; + SHAPE_ASSIGN_CHECK(*out_attrs, 0, mshadow::Shape3(lshape[0], out_m, out_n)); } else { - LOG(FATAL) << "batch_dot currently only support N-D*N-D array (N >= 3)" + LOG(FATAL) << "batch_dot currently only support 3D*3D array" << lshape << " v.s. 
" << rshape; } // return true if output shape is fully inferred diff --git a/src/operator/tensor/dot.cc b/src/operator/tensor/dot.cc index 556260ed9600..11a056146e1d 100644 --- a/src/operator/tensor/dot.cc +++ b/src/operator/tensor/dot.cc @@ -115,13 +115,13 @@ NNVM_REGISTER_OP(batch_dot) .describe(R"doc(Batchwise dot product. ``batch_dot`` is used to compute dot product of ``x`` and ``y`` when ``x`` and -``y`` are data in batch, namely N-D (N >= 3) arrays in shape of `(B0, ..., B_i, :, :)`. +``y`` are data in batch, namely 3D arrays in shape of `(batch_size, :, :)`. -For example, given ``x`` with shape `(B_0, ..., B_i, N, M)` and ``y`` with shape -`(B_0, ..., B_i, M, K)`, the result array will have shape `(B_0, ..., B_i, N, K)`, +For example, given ``x`` with shape `(batch_size, n, m)` and ``y`` with shape +`(batch_size, m, k)`, the result array will have shape `(batch_size, n, k)`, which is computed by:: - batch_dot(x,y)[b_0, ..., b_i, :, :] = dot(x[b_0, ..., b_i, :, :], y[b_0, ..., b_i, :, :]) + batch_dot(x,y)[i,:,:] = dot(x[i,:,:], y[i,:,:]) )doc" ADD_FILELINE) .set_num_inputs(2) @@ -138,73 +138,21 @@ which is computed by:: return std::vector{ResourceRequest::kTempSpace}; }) .set_attr("FCompute", BatchDotForward_) -.set_attr("FGradient", - [](const nnvm::NodePtr& n, - const std::vector& ograds) { - const DotParam& param = nnvm::get(n->attrs.parsed); - nnvm::NodePtr lhs_grad; - nnvm::NodePtr rhs_grad; - std::string lhs_gnode_name = n->attrs.name + "_backward_lhs"; - std::string rhs_gnode_name = n->attrs.name + "_backward_rhs"; - if (param.transpose_a && param.transpose_b) { - // Gradient of z = dot(x.T, y.T) - // dx = dot(dz, y).T = dot(y.T, dz.T) - // dy = dot(x, dz).T = dot(dz.T, x.T) - lhs_grad = MakeNode("batch_dot", lhs_gnode_name, - {n->inputs[1], ograds[0]}, &(n->attrs.dict), &n); - rhs_grad = MakeNode("batch_dot", rhs_gnode_name, - {ograds[0], n->inputs[0]}, &(n->attrs.dict), &n); - } else if (!param.transpose_a && param.transpose_b) { - // Gradient of z = dot(x, y.T) - // dx = dot(dz, y) - // dy = dot(x.T, dz).T = dot(dz.T, x) - auto lhs_attrs_dict = n->attrs.dict; - auto rhs_attrs_dict = n->attrs.dict; - lhs_attrs_dict["transpose_a"] = "false"; - lhs_attrs_dict["transpose_b"] = "false"; - rhs_attrs_dict["transpose_a"] = "true"; - rhs_attrs_dict["transpose_b"] = "false"; - lhs_grad = MakeNode("batch_dot", lhs_gnode_name, - {ograds[0], n->inputs[1]}, &lhs_attrs_dict, &n); - rhs_grad = MakeNode("batch_dot", rhs_gnode_name, - {ograds[0], n->inputs[0]}, &rhs_attrs_dict, &n); - } else if (param.transpose_a && !param.transpose_b) { - // Gradient of z = dot(x.T, y) - // dx = dot(dz, y.T).T = dot(y, dz.T) - // dy = dot(x, dz) - auto lhs_attrs_dict = n->attrs.dict; - auto rhs_attrs_dict = n->attrs.dict; - lhs_attrs_dict["transpose_a"] = "false"; - lhs_attrs_dict["transpose_b"] = "true"; - rhs_attrs_dict["transpose_a"] = "false"; - rhs_attrs_dict["transpose_b"] = "false"; - lhs_grad = MakeNode("batch_dot", lhs_gnode_name, - {n->inputs[1], ograds[0]}, &lhs_attrs_dict, &n); - rhs_grad = MakeNode("batch_dot", rhs_gnode_name, - {n->inputs[0], ograds[0]}, &rhs_attrs_dict, &n); - } else { - // Gradient of z = dot(x, y) - // dx = dot(dz, y.T) - // dy = dot(x.T, dz) - auto lhs_attrs_dict = n->attrs.dict; - auto rhs_attrs_dict = n->attrs.dict; - lhs_attrs_dict["transpose_a"] = "false"; - lhs_attrs_dict["transpose_b"] = "true"; - rhs_attrs_dict["transpose_a"] = "true"; - rhs_attrs_dict["transpose_b"] = "false"; - lhs_grad = MakeNode("batch_dot", lhs_gnode_name, - {ograds[0], n->inputs[1]}, 
&lhs_attrs_dict, &n); - rhs_grad = MakeNode("batch_dot", rhs_gnode_name, - {n->inputs[0], ograds[0]}, &rhs_attrs_dict, &n); - } - std::vector ret; - ret.emplace_back(nnvm::NodeEntry{lhs_grad, 0, 0}); - ret.emplace_back(nnvm::NodeEntry{rhs_grad, 0, 0}); - return ret; -}) +.set_attr("FGradient", ElemwiseGradUseIn{"_backward_batch_dot"}) .add_argument("lhs", "NDArray-or-Symbol", "The first input") .add_argument("rhs", "NDArray-or-Symbol", "The second input") .add_arguments(DotParam::__FIELDS__()); +NNVM_REGISTER_OP(_backward_batch_dot) +.set_num_inputs(3) +.set_num_outputs(2) +.set_attr_parser(ParamParser) +.set_attr("FResourceRequest", + [](const NodeAttrs& attrs) { + return std::vector{ResourceRequest::kTempSpace}; + }) +.set_attr("TIsBackward", true) +.set_attr("FCompute", BatchDotBackward_); + } // namespace op } // namespace mxnet diff --git a/src/operator/tensor/dot.cu b/src/operator/tensor/dot.cu index b245b1c9e5ed..8ee2e2832fbb 100644 --- a/src/operator/tensor/dot.cu +++ b/src/operator/tensor/dot.cu @@ -38,5 +38,8 @@ NNVM_REGISTER_OP(_backward_dot) NNVM_REGISTER_OP(batch_dot) .set_attr("FCompute", BatchDotForward_); +NNVM_REGISTER_OP(_backward_batch_dot) +.set_attr("FCompute", BatchDotBackward_); + } // namespace op } // namespace mxnet diff --git a/tests/nightly/test_large_array.py b/tests/nightly/test_large_array.py index c18a95400f22..0cb21cedee35 100644 --- a/tests/nightly/test_large_array.py +++ b/tests/nightly/test_large_array.py @@ -1415,10 +1415,10 @@ def check_arcsinh(): assert_correctness_of_trigonometric_ops(y, expected_output) def check_arccosh(): - x = create_input_for_trigonometric_ops([1, np.pi/2, 3*np.pi/4, np.pi, 5*np.pi/4]) + x = create_input_for_trigonometric_ops([1, np.pi/2, 3*np.pi/4, np.pi]) y = nd.arccosh(x) # expected ouput for indices=(0, 1, -3, -2, -1) after applying arccosh() - expected_output = [0, np.arccosh(np.pi/2), np.arccosh(3*np.pi/4), np.arccosh(np.pi), np.arccosh(5*np.pi/4)] + expected_output = [0, np.arccosh(np.pi/2), np.arccosh(3*np.pi/4), np.arccosh(np.pi)] assert_correctness_of_trigonometric_ops(y, expected_output) def check_arctanh(): diff --git a/tests/nightly/test_large_vector.py b/tests/nightly/test_large_vector.py index b8edc83220bd..23f4b8e4f310 100644 --- a/tests/nightly/test_large_vector.py +++ b/tests/nightly/test_large_vector.py @@ -556,8 +556,8 @@ def test_concat(): a = nd.ones(LARGE_X) b = nd.zeros(LARGE_X) c = nd.concat(a, b, dim=0) - assert c[0] == 1 - assert c[-1] == 0 + assert c[0][0] == 1 + assert c[-1][-1] == 0 assert c.shape[0] == (2 * LARGE_X) @@ -710,37 +710,6 @@ def test_full(): assert a[-1] == 3 -def test_sign(): - a = mx.nd.random.normal(-1, 1, shape=LARGE_X) - mx_res = mx.nd.sign(a) - assert_almost_equal(mx_res[-1].asnumpy(), np.sign(a[-1].asnumpy())) - - -def test_logical(): - def check_logical_and(a, b): - mx_res = mx.nd.logical_and(a, b) - assert_almost_equal(mx_res[-1].asnumpy(), np.logical_and(a[-1].asnumpy(), b[-1].asnumpy())) - - def check_logical_or(a, b): - mx_res = mx.nd.logical_or(a, b) - assert_almost_equal(mx_res[-1].asnumpy(), np.logical_or(a[-1].asnumpy(), b[-1].asnumpy())) - - def check_logical_not(a, b): - mx_res = mx.nd.logical_not(a, b) - assert_almost_equal(mx_res[-1].asnumpy(), np.logical_not(a[-1].asnumpy(), b[-1].asnumpy())) - - def check_logical_xor(a, b): - mx_res = mx.nd.logical_xor(a, b) - assert_almost_equal(mx_res[-1].asnumpy(), np.logical_xor(a[-1].asnumpy(), b[-1].asnumpy())) - - a = mx.nd.ones(LARGE_X) - b = mx.nd.zeros(LARGE_X) - check_logical_and(a, b) - check_logical_or(a, b) - 
check_logical_not(a, b) - check_logical_xor(a, b) - - def test_astype(): x = create_vector(size=LARGE_X//4) x = nd.tile(x, 4) @@ -783,7 +752,7 @@ def assert_correctness_of_rounding_ops(output, mid, expected_vals): def test_rounding_ops(): x = create_input_for_rounding_ops() - + def check_ceil(): y = nd.ceil(x) # expected ouput for middle 5 values after applying ceil() @@ -885,48 +854,6 @@ def check_tan(): expected_output = [-.577, -1, 0, 1, .577] assert_correctness_of_trigonometric_ops(y, expected_output) - def check_arcsinh(): - x = create_input_for_trigonometric_ops([-np.pi/2, -np.pi/4, 0, np.pi/4, np.pi/2]) - y = nd.arcsinh(x) - # expected ouput for indices=(0, 1, -3, -2, -1) after applying arcsinh() - expected_output = [np.arcsinh(-np.pi/2), np.arcsinh(-np.pi/4), 0, np.arcsinh(np.pi/4), np.arcsinh(np.pi/2)] - assert_correctness_of_trigonometric_ops(y, expected_output) - - def check_arccosh(): - x = create_input_for_trigonometric_ops([1, np.pi/2, 3*np.pi/4, np.pi, 5*np.pi/4]) - y = nd.arccosh(x) - # expected ouput for indices=(0, 1, -3, -2, -1) after applying arccosh() - expected_output = [0, np.arccosh(np.pi/2), np.arccosh(3*np.pi/4), np.arccosh(np.pi), np.arccosh(5*np.pi/4)] - assert_correctness_of_trigonometric_ops(y, expected_output) - - def check_arctanh(): - x = create_input_for_trigonometric_ops([-1/4, -1/2, 0, 1/4, 1/2]) - y = nd.arctanh(x) - # expected ouput for indices=(0, 1, -3, -2, -1) after applying arctanh() - expected_output = [np.arctanh(-1/4), np.arctanh(-1/2), 0, np.arctanh(1/4), np.arctanh(1/2)] - assert_correctness_of_trigonometric_ops(y, expected_output) - - def check_sinh(): - x = create_input_for_trigonometric_ops([-np.pi/2, -np.pi/4, 0, np.pi/4, np.pi/2]) - y = nd.sinh(x) - # expected ouput for indices=(0, 1, -3, -2, -1) after applying sinh() - expected_output = [np.sinh(-np.pi/2), np.sinh(-np.pi/4), 0, np.sinh(np.pi/4), np.sinh(np.pi/2)] - assert_correctness_of_trigonometric_ops(y, expected_output) - - def check_cosh(): - x = create_input_for_trigonometric_ops([0, 1, np.pi/2, 3*np.pi/4, np.pi]) - y = nd.cosh(x) - # expected ouput for indices=(0, 1, -3, -2, -1) after applying cosh() - expected_output = [1, np.cosh(1), np.cosh(np.pi/2), np.cosh(3*np.pi/4), np.cosh(np.pi)] - assert_correctness_of_trigonometric_ops(y, expected_output) - - def check_tanh(): - x = create_input_for_trigonometric_ops([-1/4, -1/2, 0, 1/4, 1/2]) - y = nd.tanh(x) - # expected ouput for indices=(0, 1, -3, -2, -1) after applying tanh() - expected_output = [np.tanh(-1/4), np.tanh(-1/2), 0, np.tanh(1/4), np.tanh(1/2)] - assert_correctness_of_trigonometric_ops(y, expected_output) - def check_radians(): x = create_input_for_trigonometric_ops([0, 90, 180, 270, 360]) y = nd.radians(x) @@ -947,12 +874,6 @@ def check_degrees(): check_sin() check_cos() check_tan() - check_arcsinh() - check_arccosh() - check_arctanh() - check_sinh() - check_cosh() - check_tanh() check_radians() check_degrees() diff --git a/tests/python/unittest/test_numpy_op.py b/tests/python/unittest/test_numpy_op.py index ae8ad621df75..b764ac73d30c 100644 --- a/tests/python/unittest/test_numpy_op.py +++ b/tests/python/unittest/test_numpy_op.py @@ -24,7 +24,6 @@ import platform import mxnet as mx import scipy.stats as ss -from nose.tools import assert_raises from mxnet import np, npx from mxnet.gluon import HybridBlock from mxnet.base import MXNetError @@ -902,124 +901,6 @@ def hybrid_forward(self, F, a): expected_grad[basic_index] = 1 assert same(a.grad.asnumpy(), expected_grad) -@with_seed() -@use_np -def test_npx_batch_dot(): - 
ctx = mx.context.current_context() - dtypes = ['float32', 'float64'] - if ctx.device_type == 'gpu': - dtypes += ['float16'] - eps_dict = {'float32': 1E-4, 'float64': 1E-4, 'float16': 1E-3} - class TestBatchDot(HybridBlock): - def __init__(self, transpose_a, transpose_b): - super(TestBatchDot, self).__init__() - self._transpose_a = transpose_a - self._transpose_b = transpose_b - - def hybrid_forward(self, F, lhs, rhs): - return F.npx.batch_dot(lhs, rhs, - transpose_a=self._transpose_a, - transpose_b=self._transpose_b) - - def batch_dot_numpy(lhs, rhs, transpose_a, transpose_b): - assert lhs.ndim == rhs.ndim >= 3 - if transpose_a: - lhs = lhs.swapaxes(-1, -2) - if transpose_b: - rhs = rhs.swapaxes(-1, -2) - return _np.matmul(lhs, rhs) - - def gt_grad_batch_dot_numpy(lhs, rhs, ograd, transpose_a, transpose_b, lhs_req, rhs_req, - init_lhs_grad, init_rhs_grad): - - if transpose_a and transpose_b: - # Gradient of z = dot(x.T, y.T) - # dx = dot(dz, y).T = dot(y.T, dz.T) - # dy = dot(x, dz).T = dot(dz.T, x.T) - lhs_grad = batch_dot_numpy(rhs, ograd, transpose_a=True, transpose_b=True) - rhs_grad = batch_dot_numpy(ograd, lhs, transpose_a=True, transpose_b=True) - elif not transpose_a and transpose_b: - # Gradient of z = dot(x, y.T) - # dx = dot(dz, y) - # dy = dot(x.T, dz).T = dot(dz.T, x) - lhs_grad = batch_dot_numpy(ograd, rhs, transpose_a=False, transpose_b=False) - rhs_grad = batch_dot_numpy(ograd, lhs, transpose_a=True, transpose_b=False) - elif transpose_a and not transpose_b: - # Gradient of z = dot(x.T, y) - # dx = dot(dz, y.T).T = dot(y, dz.T) - # dy = dot(x, dz) - lhs_grad = batch_dot_numpy(rhs, ograd, transpose_a=False, transpose_b=True) - rhs_grad = batch_dot_numpy(lhs, ograd, transpose_a=False, transpose_b=False) - else: - # Gradient of z = dot(x, y) - # dx = dot(dz, y.T) - # dy = dot(x.T, dz) - lhs_grad = batch_dot_numpy(ograd, rhs, transpose_a=False, transpose_b=True) - rhs_grad = batch_dot_numpy(lhs, ograd, transpose_a=True, transpose_b=False) - if lhs_req == 'add': - lhs_grad += init_lhs_grad - if rhs_req == 'add': - rhs_grad += init_rhs_grad - return lhs_grad, rhs_grad - - - configs = [ - ((2, 3, 0), (2, 4, 0), False, True), - ((2, 4, 3), (2, 4, 3), True, False), - ((0, 3, 0), (0, 0, 2), False, False), - ((3, 2, 3, 2), (3, 2, 2, 3), True, True), - ((3, 1, 5, 2), (3, 1, 2, 1), False, False) - ] - bad_configs = [ - ((5, 3, 2), (5, 1, 3), False, False), - ((2, 5, 3, 1), (2, 4, 3, 1), True, False) - ] - for hybridize in [True, False]: - for lhs_shape, rhs_shape, transpose_a, transpose_b in configs: - for dtype in dtypes: - eps = eps_dict[dtype] - for lhs_grad_req in ['write', 'add']: - for rhs_grad_req in ['write', 'add']: - f_batch_dot = TestBatchDot(transpose_a=transpose_a, - transpose_b=transpose_b) - if hybridize: - f_batch_dot.hybridize() - lhs_val = mx.np.array(_np.random.uniform(-1.0, 1.0, lhs_shape), dtype=dtype) - rhs_val = mx.np.array(_np.random.uniform(-1.0, 1.0, rhs_shape), dtype=dtype) - lhs_val.attach_grad(grad_req=lhs_grad_req) - rhs_val.attach_grad(grad_req=rhs_grad_req) - gt_out = batch_dot_numpy(lhs_val.asnumpy(), rhs_val.asnumpy(), - transpose_a, transpose_b) - init_lhs_grad = mx.np.random.uniform(-1.0, 1.0, lhs_shape, dtype=dtype) - init_rhs_grad = mx.np.random.uniform(-1.0, 1.0, rhs_shape, dtype=dtype) - o_grad = mx.np.random.uniform(-1.0, 1.0, gt_out.shape, dtype=dtype) - if lhs_grad_req == 'add': - lhs_val.grad[:] = init_lhs_grad - if rhs_grad_req == 'add': - rhs_val.grad[:] = init_rhs_grad - with mx.autograd.record(): - out = f_batch_dot(lhs_val, rhs_val) - 
out.backward(o_grad) - assert_almost_equal(out.asnumpy(), gt_out, rtol=eps, atol=eps) - gt_lhs_grad, gt_rhs_grad = gt_grad_batch_dot_numpy(lhs_val.asnumpy(), - rhs_val.asnumpy(), - o_grad.asnumpy(), - transpose_a=transpose_a, - transpose_b=transpose_b, - lhs_req=lhs_grad_req, - rhs_req=rhs_grad_req, - init_lhs_grad=init_lhs_grad.asnumpy(), - init_rhs_grad=init_rhs_grad.asnumpy()) - assert_almost_equal(lhs_val.grad.asnumpy(), gt_lhs_grad, rtol=eps, atol=eps) - assert_almost_equal(rhs_val.grad.asnumpy(), gt_rhs_grad, rtol=eps, atol=eps) - for lhs_shape, rhs_shape, transpose_a, transpose_b in bad_configs: - for dtype in dtypes: - lhs_val = mx.np.array(_np.random.uniform(-1.0, 1.0, lhs_shape), dtype=dtype) - rhs_val = mx.np.array(_np.random.uniform(-1.0, 1.0, rhs_shape), dtype=dtype) - assert_raises(MXNetError, lambda: mx.npx.batch_dot(lhs_val, rhs_val, - transpose_a=transpose_a, - transpose_b=transpose_b)) - @with_seed() @use_np diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index dde28fdb766f..7ea106b2620f 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -2964,7 +2964,6 @@ def test_big_transpose(): assert_allclose(x_np, z.asnumpy().astype('uint8')) -@with_seed() def test_larger_transpose(): x = mx.nd.random.normal(shape=(50,51)) y = mx.nd.transpose(x) @@ -3325,9 +3324,9 @@ def test_batch_dot(): agrad_npy = np.empty((batch_size, m, k), dtype=data_type) bgrad_npy = np.empty((batch_size, k, n), dtype=data_type) a_init_grad_npy = np.random.normal(size=(batch_size, m, k)) - a_init_grad_npy = a_init_grad_npy.astype(data_type) + a_init_grad_npy = a_npy.astype(data_type) b_init_grad_npy = np.random.normal(size=(batch_size, k, n)) - b_init_grad_npy = b_init_grad_npy.astype(data_type) + b_init_grad_npy = b_npy.astype(data_type) for i in range(batch_size): c_npy[i, :, :] = np.dot(a_npy[i, :, :], b_npy[i, :, :]) bgrad_npy[i, :, :] = np.dot(a_npy[i, :, :].T, ograd_npy[i, :, :])
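As a sanity check on the gradient identities that the restored `BatchDotBackward_` implements, here is a small NumPy sketch of the no-transpose case; shapes and values are illustrative:

```python
import numpy as np

rng = np.random.RandomState(0)
x = rng.randn(4, 2, 3)    # lhs,  shape (batch_size, n, m)
y = rng.randn(4, 3, 5)    # rhs,  shape (batch_size, m, k)
dz = rng.randn(4, 2, 5)   # head gradient for z = batch_dot(x, y)

# z = x @ y per batch  =>  dx = dz @ y.T  and  dy = x.T @ dz  (batched)
dx = np.matmul(dz, y.transpose(0, 2, 1))
dy = np.matmul(x.transpose(0, 2, 1), dz)
assert dx.shape == x.shape and dy.shape == y.shape
```

The transposed cases follow the same pattern and match the comments in the backward kernel; for example, for `z = dot(x.T, y)`, `dx = dot(y, dz.T)` and `dy = dot(x, dz)`.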