diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh index 70934f64080c..e2702dd285e1 100755 --- a/ci/docker/runtime_functions.sh +++ b/ci/docker/runtime_functions.sh @@ -1143,8 +1143,9 @@ build_python_docs() { export PATH=/home/jenkins_slave/.local/bin:$PATH pushd python + cp tutorials/getting-started/crash-course/prepare_dataset.py . make clean - make html EVAL=0 + make html EVAL=1 GZIP=-9 tar zcvf python-artifacts.tgz -C build/_build/html . popd diff --git a/ci/jenkins/Jenkins_steps.groovy b/ci/jenkins/Jenkins_steps.groovy index ac83c100cb5a..93c93a6dd7e9 100644 --- a/ci/jenkins/Jenkins_steps.groovy +++ b/ci/jenkins/Jenkins_steps.groovy @@ -1070,11 +1070,11 @@ def should_pack_website() { // Call this function from Jenkins to generate just the Python API microsite artifacts. def docs_python(lib_name) { return ['Python Docs': { - node(NODE_LINUX_CPU) { + node(NODE_LINUX_GPU_G4) { ws('workspace/docs') { timeout(time: max_time, unit: 'MINUTES') { - utils.unpack_and_init(lib_name, mx_lib, false) - utils.docker_run('ubuntu_cpu', 'build_python_docs', false) + utils.unpack_and_init(lib_name, mx_lib_cython) + utils.docker_run('ubuntu_gpu_cu111', 'build_python_docs', true) if (should_pack_website()) { utils.pack_lib('python-artifacts', 'docs/_build/python-artifacts.tgz', false) } diff --git a/ci/jenkins/Jenkinsfile_website_beta b/ci/jenkins/Jenkinsfile_website_beta index c789b30988f5..b667daedfaa2 100644 --- a/ci/jenkins/Jenkinsfile_website_beta +++ b/ci/jenkins/Jenkinsfile_website_beta @@ -36,13 +36,15 @@ utils.assign_node_labels(utility: 'restricted-utility', linux_cpu: 'restricted-m utils.main_wrapper( core_logic: { utils.parallel_stage('Build', [ - custom_steps.compile_unix_cpu_openblas('libmxnet') + custom_steps.compile_unix_cpu_openblas('libmxnet'), + custom_steps.compile_unix_full_gpu('libmxnet_gpu') ]) utils.parallel_stage('Build Docs', [ // Only building a subset of the docs for previewing on staging custom_steps.docs_jekyll(), - custom_steps.docs_python('libmxnet') + custom_steps.docs_c('libmxnet'), + custom_steps.docs_python('libmxnet_gpu') ]) utils.parallel_stage('Prepare', [ diff --git a/ci/jenkins/Jenkinsfile_website_full b/ci/jenkins/Jenkinsfile_website_full index d2de41132d72..354cb51048c4 100644 --- a/ci/jenkins/Jenkinsfile_website_full +++ b/ci/jenkins/Jenkinsfile_website_full @@ -35,13 +35,14 @@ utils.assign_node_labels(utility: 'restricted-utility', linux_cpu: 'restricted-m utils.main_wrapper( core_logic: { utils.parallel_stage('Build', [ - custom_steps.compile_unix_cpu_openblas('libmxnet') + custom_steps.compile_unix_cpu_openblas('libmxnet'), + custom_steps.compile_unix_full_gpu('libmxnet_gpu') ]) utils.parallel_stage('Build Docs', [ custom_steps.docs_jekyll(), custom_steps.docs_c('libmxnet'), - custom_steps.docs_python('libmxnet'), + custom_steps.docs_python('libmxnet_gpu'), ]) utils.parallel_stage('Prepare', [ diff --git a/ci/jenkins/Jenkinsfile_website_full_pr b/ci/jenkins/Jenkinsfile_website_full_pr index 7ac880fc9127..1f7cfaeb5d96 100644 --- a/ci/jenkins/Jenkinsfile_website_full_pr +++ b/ci/jenkins/Jenkinsfile_website_full_pr @@ -29,19 +29,20 @@ node('utility') { utils = load('ci/Jenkinsfile_utils.groovy') custom_steps = load('ci/jenkins/Jenkins_steps.groovy') } -utils.assign_node_labels(utility: 'utility', linux_cpu: 'mxnetlinux-cpu') +utils.assign_node_labels(utility: 'utility', linux_cpu: 'mxnetlinux-cpu', linux_gpu_g4: 'mxnetlinux-gpu-g4', linux_gpu: 'mxnetlinux-gpu') utils.main_wrapper( core_logic: { utils.parallel_stage('Build', [ - 
custom_steps.compile_unix_cpu_openblas('libmxnet') + custom_steps.compile_unix_cpu_openblas('libmxnet'), + custom_steps.compile_unix_full_gpu('libmxnet_gpu') ]) utils.parallel_stage('Build Docs', [ // Optimization would be to flag these not to stash if not previewing them custom_steps.docs_jekyll(), custom_steps.docs_c('libmxnet'), - custom_steps.docs_python('libmxnet'), + custom_steps.docs_python('libmxnet_gpu'), ]) // TODO: add a website preview function diff --git a/ci/jenkins/Jenkinsfile_website_nightly b/ci/jenkins/Jenkinsfile_website_nightly index 6fa5d1a9396f..5d81588cc530 100644 --- a/ci/jenkins/Jenkinsfile_website_nightly +++ b/ci/jenkins/Jenkinsfile_website_nightly @@ -30,18 +30,19 @@ node('restricted-utility') { custom_steps = load('ci/jenkins/Jenkins_steps.groovy') } -utils.assign_node_labels(utility: 'restricted-utility', linux_cpu: 'restricted-mxnetlinux-cpu') +utils.assign_node_labels(utility: 'restricted-utility', linux_cpu: 'restricted-mxnetlinux-cpu', linux_gpu: 'restricted-mxnetlinux-gpu') utils.main_wrapper( core_logic: { utils.parallel_stage('Build', [ - custom_steps.compile_unix_cpu_openblas('libmxnet') + custom_steps.compile_unix_cpu_openblas('libmxnet'), + custom_steps.compile_unix_full_gpu('libmxnet_gpu') ]) utils.parallel_stage('Build Docs', [ custom_steps.docs_jekyll(), custom_steps.docs_c('libmxnet'), - custom_steps.docs_python('libmxnet'), + custom_steps.docs_python('libmxnet_gpu'), ]) utils.parallel_stage('Prepare', [ diff --git a/ci/jenkins/Jenkinsfile_website_python_docs b/ci/jenkins/Jenkinsfile_website_python_docs index 13c7cb1177e2..de4effdc6c8c 100644 --- a/ci/jenkins/Jenkinsfile_website_python_docs +++ b/ci/jenkins/Jenkinsfile_website_python_docs @@ -29,16 +29,16 @@ node('utility') { utils = load('ci/Jenkinsfile_utils.groovy') custom_steps = load('ci/jenkins/Jenkins_steps.groovy') } -utils.assign_node_labels(utility: 'utility', linux_cpu: 'mxnetlinux-cpu') +utils.assign_node_labels(utility: 'utility', linux_cpu: 'mxnetlinux-cpu', linux_gpu: 'mxnetlinux-gpu', linux_gpu_g4: 'mxnetlinux-gpu-g4') utils.main_wrapper( core_logic: { utils.parallel_stage('Build', [ - custom_steps.compile_unix_cpu_openblas('libmxnet') + custom_steps.compile_unix_full_gpu('libmxnet_gpu') ]) utils.parallel_stage('Python Docs', [ - custom_steps.docs_python('libmxnet') + custom_steps.docs_python('libmxnet_gpu') ]) } diff --git a/ci/jenkins/Jenkinsfile_website_version_artifacts b/ci/jenkins/Jenkinsfile_website_version_artifacts index 01daa05210b8..0a31c118cfe9 100644 --- a/ci/jenkins/Jenkinsfile_website_version_artifacts +++ b/ci/jenkins/Jenkinsfile_website_version_artifacts @@ -35,13 +35,14 @@ utils.assign_node_labels(utility: 'restricted-utility', linux_cpu: 'restricted-m utils.main_wrapper( core_logic: { utils.parallel_stage('Build', [ - custom_steps.compile_unix_cpu_openblas('libmxnet') + custom_steps.compile_unix_cpu_openblas('libmxnet'), + custom_steps.compile_unix_full_gpu('libmxnet_gpu') ]) utils.parallel_stage('Build Docs', [ custom_steps.docs_jekyll(), custom_steps.docs_c('libmxnet'), - custom_steps.docs_python('libmxnet'), + custom_steps.docs_python('libmxnet_gpu'), ]) utils.parallel_stage('Build Full Website', [ diff --git a/docs/python_docs/python/scripts/md2ipynb.py b/docs/python_docs/python/scripts/md2ipynb.py index 9d1c2c173330..8cc341b74e79 100644 --- a/docs/python_docs/python/scripts/md2ipynb.py +++ b/docs/python_docs/python/scripts/md2ipynb.py @@ -26,17 +26,28 @@ def md2ipynb(): (src_fn, input_fn, output_fn) = sys.argv # timeout for each notebook, in sec - timeout = 
20 * 60 + timeout = 60 * 60 # if enable evaluation do_eval = int(os.environ.get('EVAL', True)) + + # Skip these notebooks as some APIs will no longer be used + skip_list = ["pytorch.md", "mnist.md", "custom-loss.md", "fit_api_tutorial.md", \ + "01-ndarray-intro.md", "02-ndarray-operations.md", "03-ndarray-contexts.md", \ + "gotchas_numpy_in_mxnet.md", "csr.md", "row_sparse.md", "fine_tuning_gluon.md", \ + "inference_on_onnx_model.md", "amp.md", "profiler.md"] + + require_gpu = [] + # the files will be ignored for execution + ignore_execution = skip_list + require_gpu reader = notedown.MarkdownReader(match='strict') with open(input_fn, 'r', encoding="utf8") as f: notebook = reader.read(f) if do_eval: - tic = time.time() - notedown.run(notebook, timeout) - print('%s: Evaluated %s in %f sec'%(src_fn, input_fn, time.time()-tic)) + if not any([i in input_fn for i in ignore_execution]): + tic = time.time() + notedown.run(notebook, timeout) + print('%s: Evaluated %s in %f sec'%(src_fn, input_fn, time.time()-tic)) # need to add language info to for syntax highlight notebook['metadata'].update({'language_info':{'name':'python'}}) with open(output_fn, 'w', encoding='utf-8') as f: diff --git a/docs/python_docs/python/tutorials/deploy/export/onnx.md b/docs/python_docs/python/tutorials/deploy/export/onnx.md index 7961abad7e2a..170f139e77e7 100644 --- a/docs/python_docs/python/tutorials/deploy/export/onnx.md +++ b/docs/python_docs/python/tutorials/deploy/export/onnx.md @@ -28,41 +28,47 @@ In this tutorial, we will learn how to use MXNet to ONNX exporter on pre-trained ## Prerequisites To run the tutorial you will need to have installed the following python modules: -- [MXNet >= 1.3.0](https://mxnet.apache.org/get_started) -- [onnx]( https://github.com/onnx/onnx#user-content-installation) v1.2.1 (follow the install guide) +- [MXNet >= 2.0.0](https://mxnet.apache.org/get_started) +- [onnx]( https://github.com/onnx/onnx#user-content-installation) v1.7 & v1.8 (follow the install guide) -*Note:* MXNet-ONNX importer and exporter follows version 7 of ONNX operator set which comes with ONNX v1.2.1. +*Note:* MXNet-ONNX importer and exporter follows version 12 & 13 of ONNX operator set which comes with ONNX v1.7 & v1.8. ```{.python .input} import mxnet as mx -import numpy as np -from mxnet.contrib import onnx as onnx_mxnet +from mxnet import initializer as init, np, onnx as mxnet_onnx +from mxnet.gluon import nn import logging logging.basicConfig(level=logging.INFO) ``` -## Downloading a model from the MXNet model zoo +## Create a model from the MXNet Gluon -We download the pre-trained ResNet-18 [ImageNet](http://www.image-net.org/) model from the [MXNet Model Zoo](../../../api/gluon/model_zoo/index.rst). -We will also download synset file to match labels. +Let's build a concise model with [MXNet gluon](../../../api/gluon/index.rst) package. The model is multilayer perceptrons with two fully-connected layers. The first one is our hidden layer, which contains 256 hidden units and applies ReLU activation function. The second is our output layer. ```{.python .input} -# Download pre-trained resnet model - json and params by running following code. 
-path='http://data.mxnet.io/models/imagenet/' -[mx.test_utils.download(path+'resnet/18-layers/resnet-18-0000.params'), - mx.test_utils.download(path+'resnet/18-layers/resnet-18-symbol.json'), - mx.test_utils.download(path+'synset.txt')] +net = nn.HybridSequential() +net.add(nn.Dense(256, activation='relu'), nn.Dense(10)) ``` -Now, we have downloaded ResNet-18 symbol, params and synset file on the disk. +Then we initialize the model and export it into symbol file and parameter file. + +```{.python .input} +net.initialize(init.Normal(sigma=0.01)) +net.hybridize() +input = np.ones(shape=(50,), dtype=np.float32) +output = net(input) +net.export("mlp") +``` + +Now, we have exported the model symbol, params file on the disk. ## MXNet to ONNX exporter API Let us describe the MXNet's `export_model` API. ```{.python .input} -help(onnx_mxnet.export_model) +help(mxnet_onnx.export_model) ``` Output: @@ -110,22 +116,22 @@ Since we have downloaded pre-trained model files, we will use the `export_model` We will use the downloaded pre-trained model files (sym, params) and define input variables. ```{.python .input} -# Downloaded input symbol and params files -sym = './resnet-18-symbol.json' -params = './resnet-18-0000.params' +# The input symbol and params files +sym = './mlp-symbol.json' +params = './mlp-0000.params' # Standard Imagenet input - 3 channels, 224*224 -input_shape = (1,3,224,224) +input_shape = (50,) # Path of the output file -onnx_file = './mxnet_exported_resnet50.onnx' +onnx_file = './mxnet_exported_mlp.onnx' ``` We have defined the input parameters required for the `export_model` API. Now, we are ready to covert the MXNet model into ONNX format. ```{.python .input} # Invoke export model API. It returns path of the converted onnx model -converted_model_path = onnx_mxnet.export_model(sym, params, [input_shape], np.float32, onnx_file) +converted_model_path = mxnet_onnx.export_model(sym, params, [input_shape], [np.float32], onnx_file) ``` This API returns path of the converted model which you can later use to import the model into other frameworks. 
diff --git a/docs/python_docs/python/tutorials/deploy/inference/image_classification_jetson.md b/docs/python_docs/python/tutorials/deploy/inference/image_classification_jetson.md index 0a7a8d5d5bd2..85aca898643b 100644 --- a/docs/python_docs/python/tutorials/deploy/inference/image_classification_jetson.md +++ b/docs/python_docs/python/tutorials/deploy/inference/image_classification_jetson.md @@ -78,7 +78,8 @@ from mxnet import gluon import mxnet as mx # set context -ctx = mx.gpu() +gpus = mx.test_utils.list_gpus() +ctx = mx.gpu() if gpus else mx.cpu() # load pre-trained model net = gluon.model_zoo.vision.resnet50_v1(pretrained=True, ctx=ctx) @@ -94,17 +95,17 @@ img_path = gluon.utils.download('https://github.com/dmlc/web-data/blob/master/mx img = mx.image.imread(img_path) img = mx.image.imresize(img, 224, 224) # resize img = mx.image.color_normalize(img.astype(dtype='float32')/255, - mean=mx.nd.array([0.485, 0.456, 0.406]), - std=mx.nd.array([0.229, 0.224, 0.225])) # normalize + mean=mx.np.array([0.485, 0.456, 0.406]), + std=mx.np.array([0.229, 0.224, 0.225])) # normalize img = img.transpose((2, 0, 1)) # channel first -img = img.expand_dims(axis=0) # batchify -img = img.as_in_context(ctx) +img = mx.np.expand_dims(img, axis=0) # batchify +img = img.as_in_ctx(ctx) -prob = net(img).softmax() # predict and normalize output -idx = prob.topk(k=5)[0] # get top 5 result +prob = mx.npx.softmax(net(img)) # predict and normalize output +idx = mx.npx.topk(prob, k=5)[0] # get top 5 result for i in idx: - i = int(i.asscalar()) - print('With prob = %.5f, it contains %s' % (prob[0,i].asscalar(), labels[i])) + i = int(i.item()) + print('With prob = %.5f, it contains %s' % (prob[0,i].item(), labels[i])) ``` After running the above script, you should get the following output showing the five classes that the image most relates to with probability: diff --git a/docs/python_docs/python/tutorials/extend/customop.md b/docs/python_docs/python/tutorials/extend/customop.md index d7c08f4751eb..2ee70afbec8b 100644 --- a/docs/python_docs/python/tutorials/extend/customop.md +++ b/docs/python_docs/python/tutorials/extend/customop.md @@ -31,11 +31,12 @@ import numpy as np import mxnet as mx from mxnet import gluon, autograd import os +mx.npx.reset_np() ``` ## Parameter-less operators -This operator implements the standard sigmoid activation function. This is only for illustration purposes, in real life you would use the built-in operator `mx.nd.relu`. +This operator implements the standard sigmoid activation function. This is only for illustration purposes, in real life you would use the built-in operator `mx.npx.relu`. ### Forward & backward implementation @@ -218,7 +219,7 @@ print(y) ## Using custom operators with fork In Linux systems, the default method in multiprocessing to create process is by using fork. If there are unfinished async custom operations when forking, the program will be blocked because of python GIL. Always use sync calls like `wait_to_read` or `waitall` before calling fork. -```{.python .input} +```{.python} x = mx.nd.array([0, 1, 2, 3]) y = mx.nd.Custom(x, op_type='sigmoid') # unfinished async sigmoid operation will cause blocking @@ -227,7 +228,7 @@ os.fork() Correctly handling this will make mxnet depend upon libpython, so the workaround now is to ensure that all custom operations are executed before forking process. 
-```{.python .input} +```{.python} x = mx.nd.array([0, 1, 2, 3]) y = mx.nd.Custom(x, op_type='sigmoid') # force execution by reading y diff --git a/docs/python_docs/python/tutorials/getting-started/crash-course/2-create-nn.md b/docs/python_docs/python/tutorials/getting-started/crash-course/2-create-nn.md index 494e786c7075..7ab1a85cb923 100644 --- a/docs/python_docs/python/tutorials/getting-started/crash-course/2-create-nn.md +++ b/docs/python_docs/python/tutorials/getting-started/crash-course/2-create-nn.md @@ -181,8 +181,19 @@ class Net(nn.Block): ```{.python .input} class MLP(nn.Block): - def __init__(self): super().__init__() self.dense1 = nn.Dense(5,activation='relu') self.dense2 = nn.Dense(25,activation='relu') self.dense3 = nn.Dense(2) - def forward(self, x): layer1 = self.dense1(x) layer2 = self.dense2(layer1) layer3 = self.dense3(layer2) return layer3 net = MLP() + def __init__(self): + super().__init__() + self.dense1 = nn.Dense(5,activation='relu') + self.dense2 = nn.Dense(25,activation='relu') + self.dense3 = nn.Dense(2) + + def forward(self, x): + layer1 = self.dense1(x) + layer2 = self.dense2(layer1) + layer3 = self.dense3(layer2) + return layer3 + +net = MLP() net ``` diff --git a/docs/python_docs/python/tutorials/getting-started/crash-course/4-components.md b/docs/python_docs/python/tutorials/getting-started/crash-course/4-components.md index 3dad7247d48a..135c22a85b8a 100644 --- a/docs/python_docs/python/tutorials/getting-started/crash-course/4-components.md +++ b/docs/python_docs/python/tutorials/getting-started/crash-course/4-components.md @@ -273,7 +273,7 @@ print(curr_weight) ```{.python .input} batch_size = len(nn_input) -trainer.step(batch_size) +trainer.step(batch_size, ignore_stale_grad=True) print(net.weight.data()) ``` @@ -364,7 +364,7 @@ p = precision() And finally, call the `update` method to return the results of `precision` for your data ```{.python .input} -p.update(np.array(y_true), np.array(y_pred)) +p.update(np.array(labels), np.array(preds)) ``` ## Next steps diff --git a/docs/python_docs/python/tutorials/getting-started/crash-course/5-datasets.md b/docs/python_docs/python/tutorials/getting-started/crash-course/5-datasets.md index d9c76a8a735c..68cdbd829729 100644 --- a/docs/python_docs/python/tutorials/getting-started/crash-course/5-datasets.md +++ b/docs/python_docs/python/tutorials/getting-started/crash-course/5-datasets.md @@ -37,9 +37,9 @@ You will first start by generating random data `X` (with 3 variables) and corres ```{.python .input} -mx.random.seed(42) # Fix the seed for reproducibility -X = mx.random.uniform(shape=(10, 3)) -y = mx.random.uniform(shape=(10, 1)) +mx.np.random.seed(42) # Fix the seed for reproducibility +X = mx.np.random.uniform(size=(10, 3)) +y = mx.np.random.uniform(size=(10, 1)) dataset = mx.gluon.data.dataset.ArrayDataset(X, y) ``` @@ -93,8 +93,8 @@ def transform(data, label): data = data.astype('float32')/255 return data, label -train_dataset = mx.gluon.data.vision.datasets.FashionMNIST(train=True, transform=transform) -valid_dataset = mx.gluon.data.vision.datasets.FashionMNIST(train=False, transform=transform) +train_dataset = mx.gluon.data.vision.datasets.FashionMNIST(train=True).transform(transform) +valid_dataset = mx.gluon.data.vision.datasets.FashionMNIST(train=False).transform(transform) ``` @@ -109,7 +109,7 @@ label_desc = {0:'T-shirt/top', 1:'Trouser', 2:'Pullover', 3:'Dress', 4:'Coat', 5 print("Data type: {}".format(data.dtype)) print("Label: {}".format(label)) -print("Label description: 
{}".format(label_desc[label])) +print("Label description: {}".format(label_desc[label.item()])) imshow(data[:,:,0].asnumpy(), cmap='gray') ``` @@ -169,8 +169,8 @@ You instantiate the ImageFolderDatasets by providing the path to the data, and t Optionally, you can pass a `transform` parameter to these `Dataset`'s as you've seen before. ```{.python .input} -training_path='/home/ec2-user/SageMaker/data/101_ObjectCategories' -testing_path='/home/ec2-user/SageMaker/data/101_ObjectCategories_test' +training_path='./data/101_ObjectCategories' +testing_path='./data/101_ObjectCategories_test' train_dataset = mx.gluon.data.vision.datasets.ImageFolderDataset(training_path) test_dataset = mx.gluon.data.vision.datasets.ImageFolderDataset(testing_path) ``` @@ -201,7 +201,7 @@ imshow(data.asnumpy(), cmap='gray') Sometimes you have data that doesn't quite fit the format expected by the included Datasets. You might be able to preprocess your data to fit the expected format, but it is easy to create your own dataset to do this. -All you need to do is create a class that implements a `__getitem__` method, that returns a sample (i.e. a tuple of mx.nd.NDArrays). +All you need to do is create a class that implements a `__getitem__` method, that returns a sample (i.e. a tuple of mx.np.ndarrays). # New in MXNet 2.0: faster C++ backend dataloaders diff --git a/docs/python_docs/python/tutorials/getting-started/crash-course/6-train-nn.md b/docs/python_docs/python/tutorials/getting-started/crash-course/6-train-nn.md index 5660de34504e..8f1b23b1e4ae 100644 --- a/docs/python_docs/python/tutorials/getting-started/crash-course/6-train-nn.md +++ b/docs/python_docs/python/tutorials/getting-started/crash-course/6-train-nn.md @@ -45,7 +45,7 @@ import numpy as np from prepare_dataset import process_dataset #utility code to rearrange the data -mx.random.seed(42) +mx.np.random.seed(42) ``` ```{.python .input} @@ -322,13 +322,13 @@ hybridize the model. 
```{.python .input} # Create the model based on the blueprint provided and initialize the parameters -ctx = mx.cpu() +ctx = mx.gpu() initializer = mx.initializer.Xavier() model = LeafNetwork() model.initialize(initializer, ctx=ctx) -model.summary(mx.nd.random.uniform(shape=(4, 3, 128, 128))) +model.summary(mx.np.random.uniform(size=(4, 3, 128, 128), ctx=ctx)) model.hybridize() ``` @@ -368,7 +368,7 @@ def test(val_data): for batch in val_data: data = batch[0] labels = batch[1] - outputs = model(data) + outputs = model(data.as_in_ctx(ctx)) acc.update([labels], [outputs]) _, accuracy = acc.get() @@ -396,8 +396,8 @@ for epoch in range(epochs): data = batch[0] label = batch[1] with mx.autograd.record(): - outputs = model(data) - loss = loss_fn(outputs, label) + outputs = model(data.as_in_ctx(ctx)) + loss = loss_fn(outputs, label.as_in_ctx(ctx)) mx.autograd.backward(loss) trainer.step(batch_size) accuracy.update([label], [outputs]) @@ -405,7 +405,7 @@ for epoch in range(epochs): _, acc = accuracy.get() print(f"""Epoch[{epoch + 1}] Batch[{idx + 1}] Speed: {batch_size / (time.time() - btic)} samples/sec \ - batch loss = {loss.mean().asscalar()} | accuracy = {acc}""") + batch loss = {loss.mean().item()} | accuracy = {acc}""") btic = time.time() _, acc = accuracy.get() diff --git a/docs/python_docs/python/tutorials/getting-started/crash-course/7-use-gpus.md b/docs/python_docs/python/tutorials/getting-started/crash-course/7-use-gpus.md index babcdd633a54..33717f03373c 100644 --- a/docs/python_docs/python/tutorials/getting-started/crash-course/7-use-gpus.md +++ b/docs/python_docs/python/tutorials/getting-started/crash-course/7-use-gpus.md @@ -118,6 +118,7 @@ class LeafNetwork(nn.HybridBlock): Load the saved parameters onto GPU 0 directly as shown below; additionally, you could use `net.collect_params().reset_ctx(gpu)` to change the device. ```{.python .input} +net = LeafNetwork() net.load_parameters('leaf_models.params', ctx=gpu) ``` @@ -160,6 +161,11 @@ validation_transformer = transforms.Compose([ transforms.Normalize(mean, std) ]) +# Use ImageFolderDataset to create a Dataset object from directory structure +train_dataset = gluon.data.vision.ImageFolderDataset('./datasets/train') +val_dataset = gluon.data.vision.ImageFolderDataset('./datasets/validation') +test_dataset = gluon.data.vision.ImageFolderDataset('./datasets/test') + # Create data loaders batch_size = 4 train_loader = gluon.data.DataLoader(train_dataset.transform_first(training_transformer),batch_size=batch_size, shuffle=True, try_nopython=True) @@ -172,13 +178,14 @@ This is the same test function defined previously in the **Step 6**. ```{.python .input} # Function to return the accuracy for the validation and test set -def test(val_data): +def test(val_data, devices): acc = gluon.metric.Accuracy() for batch in val_data: - data = batch[0] - labels = batch[1] - outputs = model(data) - acc.update([labels], [outputs]) + data, label = batch[0], batch[1] + data_list = gluon.utils.split_and_load(data, devices) + label_list = gluon.utils.split_and_load(label, devices) + outputs = [net(X) for X in data_list] + acc.update(label_list, outputs) _, accuracy = acc.get() return accuracy @@ -206,7 +213,7 @@ epochs = 2 accuracy = gluon.metric.Accuracy() log_interval = 5 -for epoch in range(10): +for epoch in range(epochs): train_loss = 0. 
tic = time.time() btic = time.time() @@ -242,7 +249,7 @@ for epoch in range(10): _, acc = accuracy.get() - acc_val = test(validation_loader) + acc_val = test(validation_loader, devices) print(f"[Epoch {epoch + 1}] training: accuracy={acc}") print(f"[Epoch {epoch + 1}] time cost: {time.time() - tic}") print(f"[Epoch {epoch + 1}] validation: validation accuracy={acc_val}") diff --git a/docs/python_docs/python/tutorials/getting-started/gluon_from_experiment_to_deployment.md b/docs/python_docs/python/tutorials/getting-started/gluon_from_experiment_to_deployment.md index 2398d700bad0..39f6d371bbb0 100644 --- a/docs/python_docs/python/tutorials/getting-started/gluon_from_experiment_to_deployment.md +++ b/docs/python_docs/python/tutorials/getting-started/gluon_from_experiment_to_deployment.md @@ -133,9 +133,9 @@ validation_transformer = transforms.Compose([ ]) # save mean and std NDArray values for inference -mean_img = mx.nd.stack(*[mx.nd.full((224, 224), m) for m in mean]) -std_img = mx.nd.stack(*[mx.nd.full((224, 224), s) for s in std]) -mx.nd.save('mean_std_224.nd', {"mean_img": mean_img, "std_img": std_img}) +mean_img = mx.np.stack([mx.np.full((224, 224), m) for m in mean]) +std_img = mx.np.stack([mx.np.full((224, 224), s) for s in std]) +mx.npx.savez('mean_std_224.np', **{"mean_img": mean_img, "std_img": std_img}) train_path = os.path.join(path, 'train') val_path = os.path.join(path, 'valid') @@ -184,7 +184,7 @@ schedule = mx.lr_scheduler.MultiFactorScheduler(step=lr_steps, factor=lr_factor, # setup optimizer with learning rate scheduler, metric, and loss function sgd_optimizer = mx.optimizer.SGD(learning_rate=lr, lr_scheduler=schedule, momentum=momentum, wd=wd) -metric = mx.metric.Accuracy() +metric = mx.gluon.metric.Accuracy() softmax_cross_entropy = gluon.loss.SoftmaxCrossEntropyLoss() ``` @@ -196,7 +196,7 @@ Now let's define the test metrics and start fine-tuning. ```{.python .input} def test(net, val_data, ctx): - metric = mx.metric.Accuracy() + metric = mx.gluon.metric.Accuracy() for i, (data, label) in enumerate(val_data): data = gluon.utils.split_and_load(data, ctx_list=ctx, even_split=False) label = gluon.utils.split_and_load(label, ctx_list=ctx, even_split=False) @@ -224,7 +224,7 @@ for epoch in range(1, epochs + 1): l.backward() trainer.step(batch_size) - train_loss += sum([l.mean().asscalar() for l in loss]) / len(loss) + train_loss += sum([l.mean().item() for l in loss]) / len(loss) metric.update(label, outputs) _, train_acc = metric.get() diff --git a/docs/python_docs/python/tutorials/getting-started/logistic_regression_explained.md b/docs/python_docs/python/tutorials/getting-started/logistic_regression_explained.md index 94aff3d2e9ee..ddfd9e7f32b5 100644 --- a/docs/python_docs/python/tutorials/getting-started/logistic_regression_explained.md +++ b/docs/python_docs/python/tutorials/getting-started/logistic_regression_explained.md @@ -24,13 +24,13 @@ Before anything else, let's import required packages for this tutorial. 
```{.python .input} -import numpy as np +import numpy as onp import mxnet as mx -from mxnet import nd, autograd, gluon +from mxnet import np, npx, autograd, gluon from mxnet.gluon import nn, Trainer from mxnet.gluon.data import DataLoader, ArrayDataset -mx.random.seed(12345) # Added for reproducibility +mx.np.random.seed(12345) # Added for reproducibility ``` In this tutorial we will use fake dataset, which contains 10 features drawn from a normal distribution with mean equals to 0 and standard deviation equals to 1, and a class label, which can be either 0 or 1. The size of the dataset is an arbitrary value. The function below helps us to generate a dataset. Class label `y` is generated via a non-random logic, so the network would have a pattern to look for. Boundary of 3 is selected to make sure that number of positive examples smaller than negative, but not too small @@ -38,7 +38,7 @@ In this tutorial we will use fake dataset, which contains 10 features drawn from ```{.python .input} def get_random_data(size, ctx): - x = nd.normal(0, 1, shape=(size, 10), ctx=ctx) + x = np.random.normal(0, 1, size=(size, 10), ctx=ctx) y = x.sum(axis=1) > 3 return x, y ``` @@ -103,8 +103,8 @@ Below we define these objects. loss = gluon.loss.SigmoidBinaryCrossEntropyLoss() trainer = Trainer(params=net.collect_params(), optimizer='sgd', optimizer_params={'learning_rate': 0.1}) -accuracy = mx.metric.Accuracy() -f1 = mx.metric.F1() +accuracy = mx.gluon.metric.Accuracy() +f1 = mx.gluon.metric.F1() ``` The next step is to define the training function in which we iterate over all batches of training data, execute the forward pass on each batch and calculate training loss. On line 19, we sum losses of every batch per epoch into a single variable, because we calculate loss per single batch, but want to display it per epoch. @@ -129,7 +129,7 @@ def train_model(): trainer.step(batch_size) # sum losses of every batch - cumulative_train_loss += nd.sum(loss_result).asscalar() + cumulative_train_loss += np.sum(loss_result).item() return cumulative_train_loss ``` @@ -140,13 +140,13 @@ Our validation function is very similar to the training one. The main difference `Accuracy` metric requires 2 arguments: 1) a vector of ground-truth classes and 2) A vector or matrix of predictions. When predictions are of the same shape as the vector of ground-truth classes, `Accuracy` class assumes that prediction vector contains predicted classes. So, it converts the vector to `Int32` and compare each item of ground-truth classes to prediction vector. -Because of the behaviour above, you will get an unexpected result if you just apply [Sigmoid](../../api/legacy/ndarray/ndarray.rst#mxnet.ndarray.sigmoid) function to the network result and pass it to `Accuracy` metric. As mentioned before, we need to apply `Sigmoid` function to the output of the neuron to get a probability of belonging to the class 1. But `Sigmoid` function produces output in range [0; 1], and all numbers in that range are going to be casted to 0, even if it is as high as 0.99. To avoid this we write a custom bit of code on line 12, that: +Because of the behaviour above, you will get an unexpected result if you just apply [Sigmoid](https://mxnet.apache.org/versions/master/api/python/docs/api/npx/generated/mxnet.npx.sigmoid.html) function to the network result and pass it to `Accuracy` metric. As mentioned before, we need to apply `Sigmoid` function to the output of the neuron to get a probability of belonging to the class 1. 
But `Sigmoid` function produces output in range [0; 1], and all numbers in that range are going to be casted to 0, even if it is as high as 0.99. To avoid this we write a custom bit of code on line 12, that: 1. Calculates sigmoid using `Sigmoid` function 2. Subtracts a threshold from the original sigmoid output. Usually, the threshold is equal to 0.5, but it can be higher, if you want to increase certainty of an item to belong to class 1. -3. Uses [mx.nd.ceil](../../api/legacy/ndarray/ndarray.rst#mxnet.ndarray.ceil) function, which converts all negative values to 0 and all positive values to 1 +3. Uses [mx.np.ceil](https://mxnet.apache.org/versions/master/api/python/docs/api/np/generated/mxnet.np.ceil.html#mxnet-np-ceil) function, which converts all negative values to 0 and all positive values to 1 After these transformations we can pass the result to `Accuracy.update()` method and expect it to behave in a proper way. @@ -168,20 +168,20 @@ def validate_model(threshold): output = net(val_data) # Similar to cumulative training loss, calculate cumulative validation loss - cumulative_val_loss += nd.sum(loss(output, val_ground_truth_class)).asscalar() + cumulative_val_loss += np.sum(loss(output, val_ground_truth_class)).item() # getting prediction as a sigmoid - prediction = net(val_data).sigmoid() + prediction = npx.sigmoid(net(val_data)) # Converting neuron outputs to classes - predicted_classes = mx.nd.ceil(prediction - threshold) + predicted_classes = mx.np.ceil(prediction - threshold) # Update validation accuracy accuracy.update(val_ground_truth_class, predicted_classes.reshape(-1)) # calculate probabilities of belonging to different classes. F1 metric works only with this notation prediction = prediction.reshape(-1) - probabilities = mx.nd.stack(1 - prediction, prediction, axis=1) + probabilities = mx.np.stack([1 - prediction, prediction], axis=1) f1.update(val_ground_truth_class, probabilities) diff --git a/docs/python_docs/python/tutorials/getting-started/to-mxnet/pytorch.md b/docs/python_docs/python/tutorials/getting-started/to-mxnet/pytorch.md index a374d931af0f..8d4ee2af4190 100644 --- a/docs/python_docs/python/tutorials/getting-started/to-mxnet/pytorch.md +++ b/docs/python_docs/python/tutorials/getting-started/to-mxnet/pytorch.md @@ -26,7 +26,7 @@ In the next 10 minutes, we'll do a quick comparison between the two frameworks a PyTorch uses conda for installation by default, for example: ```{.python .input} -# !conda install pytorch-cpu -c pytorch +# !conda install pytorch-cpu -c pytorch, torchvision ``` For MXNet we use pip: @@ -35,10 +35,10 @@ For MXNet we use pip: # !pip install mxnet ``` -To install Apache MXNet with GPU support, you need to specify CUDA version. For example, the snippet below will install Apache MXNet with CUDA 9.2 support: +To install Apache MXNet with GPU support, you need to specify CUDA version. 
For example, the snippet below will install Apache MXNet with CUDA 10.2 support: ```{.python .input} -# !pip install mxnet-cuda92 +# !pip install mxnet-cu102 ``` ## Data manipulation @@ -60,9 +60,9 @@ y **MXNet:** ```{.python .input} -from mxnet import nd +from mxnet import np -x = nd.ones((5,3)) +x = np.ones((5,3)) y = x + 1 y ``` @@ -203,7 +203,7 @@ for epoch in range(5): loss = mx_loss_fn(mx_net(X), y) loss.backward() mx_trainer.step(batch_size=128) - total_loss += loss.mean().asscalar() + total_loss += loss.mean().item() print('epoch %d, avg loss %.4f, time %.2f' % ( epoch, total_loss/len(mx_train_data), time.time()-tic)) ``` @@ -218,7 +218,7 @@ Some of the differences in Apache MXNet when compared to PyTorch are as follows: * You need to specify the update step size (usually batch size) when performing [step()](../../../api/gluon/trainer.rst#mxnet.gluon.Trainer.step) on the trainer. -* You need to call [.asscalar()](../../../api/legacy/ndarray/ndarray.rst#mxnet.ndarray.NDArray.asscalar) to turn a multidimensional array into a scalar. +* You need to call [.item()](../../../api/np/arrays.ndarray.rst#the-n-dimensional-array-ndarray) to turn a multidimensional array into a scalar. * In this sample, Apache MXNet is twice as fast as PyTorch. Though you need to be cautious with such toy comparisons. diff --git a/docs/python_docs/python/tutorials/packages/autograd/index.md b/docs/python_docs/python/tutorials/packages/autograd/index.md index 91d7cbd892f4..1e00afd84497 100644 --- a/docs/python_docs/python/tutorials/packages/autograd/index.md +++ b/docs/python_docs/python/tutorials/packages/autograd/index.md @@ -84,13 +84,13 @@ As suggested by the name, `autograd` is automatic and so the complexities of the Step one is to import the `autograd` package. -``` +```{.python .input} from mxnet import autograd ``` As a simple example, we'll implement the regression model shown in the diagrams above, and later use `autograd` to automatically calculate the gradient of the loss with respect to each of the weight parameters. -``` +```{.python .input} import mxnet as mx from mxnet.gluon.nn import HybridSequential, Dense from mxnet.gluon.loss import L2Loss @@ -106,13 +106,13 @@ net.initialize() loss_fn = L2Loss() # Create dummy data -x = mx.nd.array([[0.3, 0.5]]) -y = mx.nd.array([[1.5]]) +x = mx.np.array([[0.3, 0.5]]) +y = mx.np.array([[1.5]]) ``` We're ready for our first forward pass through the network, and we want `autograd` to record the computational graph so we can calculate gradients. One of the simplest ways to do this is by running the network (and loss) code in the scope of an `autograd.record` context. -``` +```{.python .input} with autograd.record(): y_hat = net(x) loss = loss_fn(y_hat, y) @@ -122,13 +122,13 @@ Only operations that we want recorded are in the scope of the `autograd.record` Remember: if `loss` isn't a single scalar value (e.g. could be a loss for each sample, rather than for whole batch) a `sum` operation will be applied implicitly before starting the backward propagation, and the gradients calculated will be of this `sum` with respect to the parameters. -``` +```{.python .input} loss.backward() ``` And that's it! All the `autograd` magic is complete. We should now have gradients for each parameter of the network, which will be used by the optimizer to update the parameter values for improved performance. 
Check out the gradients of the first layer for example: -``` +```{.python .input} net[0].weight.grad() ``` @@ -140,9 +140,9 @@ With MXNet Gluon, `autograd` is critical for switching between training and infe Creating a network of a single `Dropout` block will demonstrate this. -``` +```{.python .input} dropout = mx.gluon.nn.Dropout(rate=0.5) -data = mx.nd.ones(shape=(3,3)) +data = mx.np.ones(shape=(3,3)) output = dropout(data) is_training = autograd.is_training() @@ -151,7 +151,7 @@ print('is_training:', is_training, output) We called `dropout` when `autograd` wasn't recording, so our network was in inference mode and thus we didn't see any dropout of the input (i.e. it's still ones). We can confirm the current mode by calling `autograd.is_training()`. -``` +```{.python .input} with autograd.record(): output = dropout(data) print('is_training:', is_training, output) @@ -167,7 +167,7 @@ When creating neural networks with MXNet Gluon it is assumed that you're interes Sometimes we don't need the gradients for all of the parameters though. One example would be 'freezing' the values of the parameters in certain layers. Since we don't need to update the values, we don't need the gradients. Using the `grad_req` property of a parameter and setting it to `'null'`, we can indicate this to `autograd`, saving us computation time and memory. -``` +```{.python .input} net[0].weight.grad_req = 'null' ``` @@ -185,8 +185,8 @@ With `autograd` it's simple, but there's one key difference compared to paramete As a simple example, let's take the case where $y=2x^2$ and use `autograd` to calculate gradient of $y$ with respect to $x$ at three different values of $x$. We could obviously work out the gradient by hand in this case as $dy/dx=4x$, but let's use this knowledge to check `autograd`. Given $x$ is an `ndarray` and not a `Parameter`, we need to call `x.attach_grad()`. -``` -x = mx.nd.array([1, 2, 3]) +```{.python .input} +x = mx.np.array([1, 2, 3]) x.attach_grad() with autograd.record(): y = 2 * x ** 2 @@ -200,14 +200,14 @@ As mentioned before, one of the main advantages of `autograd` is the ability to We'll write a function as a toy example of a dynamic network. We'll add an `if` condition and a loop with a variable number of iterations, both of which will depend on the input data. Although these can now be used in static graphs (with conditional operators) it's still much more natural to use native control flow. -``` +```{.python .input} import math def f(x): y = x # going to change y but still want to use x if x < 0.75: # variable num_loops because it depends on x - num_loops = math.floor(1/(1-x.asscalar())) + num_loops = math.floor(1/(1-x.item())) for i in range(num_loops): y = y * x # increase polynomial degree else: # otherwise flatline @@ -221,7 +221,7 @@ We can plot the resultant function for $x$ between 0 and 1, and we should recogn Using `autograd`, let's now find the gradient of this arbritrary function. We don't have a vectorized function in this case, because of the control flow, so let's also create a function to calculate the gradient using `autograd`. 
-``` +```{.python .input} def get_grad(f, x): x.attach_grad() with autograd.record(): @@ -229,8 +229,8 @@ def get_grad(f, x): y.backward() return x.grad -xs = mx.nd.arange(0.0, 1.0, step=0.1) -grads = [get_grad(f, x).asscalar() for x in xs] +xs = mx.np.arange(0.0, 1.0, step=0.1) +grads = [get_grad(f, x).item() for x in xs] print(grads) ``` @@ -253,8 +253,8 @@ Most of the time `autograd` will be aware of the complete computational graph, a As an example, let's take $y=x^3$ (calculated with `mxnet`) and $z=y^2$. (calculated with `numpy`). We can manually calculate $dz/dy=2y$ (once again with `numpy`), and use this as the head gradient for `autograd` to automatically calculate $dz/dx$. Applying the chain rule by hand we could calculate $dz/dx=6x^5$, so for $x=2$ we expect $dz/dx=192$. Let's check to see whether `autograd` calculates the same. -``` -x = mx.nd.array([2,]) +```{.python .input} +x = mx.np.array([2,]) x.attach_grad() # compute y inside of mxnet (with `autograd`) with autograd.record(): @@ -264,7 +264,7 @@ y_np = y.asnumpy() z_np = y_np**2 dzdy_np = 2*y_np # compute dz/dx inside of mxnet (given dz/dy) -dzdy = mx.nd.array(dzdy_np) +dzdy = mx.np.array(dzdy_np) y.backward(dzdy) print(x.grad) ``` diff --git a/docs/python_docs/python/tutorials/packages/gluon/blocks/activations/activations.md b/docs/python_docs/python/tutorials/packages/gluon/blocks/activations/activations.md index 35642a138d54..fe5daa5f546c 100644 --- a/docs/python_docs/python/tutorials/packages/gluon/blocks/activations/activations.md +++ b/docs/python_docs/python/tutorials/packages/gluon/blocks/activations/activations.md @@ -33,7 +33,7 @@ from matplotlib import pyplot as plt def visualize_activation(activation_fn): data = np.linspace(-10, 10, 501) - x = mx.nd.array(data) + x = mx.np.array(data) x.attach_grad() with mx.autograd.record(): y = activation_fn(x) @@ -42,7 +42,7 @@ def visualize_activation(activation_fn): plt.figure() plt.plot(data, y.asnumpy()) plt.plot(data, x.grad.asnumpy()) - activation = activation_fn.name[:-1] + activation = activation_fn.__class__.__name__[:-1] plt.legend(["{} activation".format(activation), "{} gradient".format(activation)]) ``` diff --git a/docs/python_docs/python/tutorials/packages/gluon/blocks/custom-layer.md b/docs/python_docs/python/tutorials/packages/gluon/blocks/custom-layer.md index d29885aa6208..cc98f1f42266 100644 --- a/docs/python_docs/python/tutorials/packages/gluon/blocks/custom-layer.md +++ b/docs/python_docs/python/tutorials/packages/gluon/blocks/custom-layer.md @@ -37,7 +37,7 @@ from __future__ import print_function import mxnet as mx from mxnet import np, npx, gluon, autograd from mxnet.gluon.nn import Dense -mx.random.seed(1) # Set seed for reproducable results +mx.np.random.seed(1) # Set seed for reproducable results ``` diff --git a/docs/python_docs/python/tutorials/packages/gluon/blocks/hybridize.md b/docs/python_docs/python/tutorials/packages/gluon/blocks/hybridize.md index f104e2cdd92b..7ce2e22b3661 100644 --- a/docs/python_docs/python/tutorials/packages/gluon/blocks/hybridize.md +++ b/docs/python_docs/python/tutorials/packages/gluon/blocks/hybridize.md @@ -98,7 +98,7 @@ Through the use of experiments, this section will demonstrate the benefits of hy Previously, we learned how to use the Sequential class to concatenate multiple layers. Next, we will replace the Sequential class with the HybridSequential class in order to make use of hybrid programming. 
```{.python .input} -from mxnet import np, npx +from mxnet import np, npx, sym from mxnet.gluon import nn import time @@ -156,7 +156,7 @@ The .json and .params files generated during this process are a symbolic program In MXNet, a symbolic program refers to a program that makes use of the Symbol type. We know that, when the NDArray input `x` is provided to `net`, `net(x)` will directly calculate the model output and return a result based on `x`. For models that have called the `hybridize` function, we can also provide a Symbol-type input variable, and `net(x)` will return Symbol type results. -```{.python .input} +```{.python} x = sym.var('data') net(x) ``` diff --git a/docs/python_docs/python/tutorials/packages/gluon/blocks/init.md b/docs/python_docs/python/tutorials/packages/gluon/blocks/init.md index 3776814f52d8..a2e5a5201455 100644 --- a/docs/python_docs/python/tutorials/packages/gluon/blocks/init.md +++ b/docs/python_docs/python/tutorials/packages/gluon/blocks/init.md @@ -43,7 +43,7 @@ greatly simplifies the process of doing deep learning. Let's see what happens when we instantiate a network. We start by defining a multi-layer perceptron. ```{.python .input} -from mxnet import init, nd +from mxnet import init, np from mxnet.gluon import nn @@ -85,7 +85,7 @@ As we can see, nothing really changed. Only once we provide the network with some data do we see a difference. Let's try it out. ```{.python .input} -x = nd.random.uniform(shape=(2, 20)) +x = np.random.uniform(size=(2, 20)) net(x) # Forward computation print(net.collect_params()) ``` @@ -113,13 +113,13 @@ nothing but report a debug message stating when it was invoked and with which parameters. ```{.python .input n=22} -class PrintInit(init.Initializer): +class MyInit(init.Initializer): def _init_weight(self, name, data): print('Init', name, data.shape) # The actual initialization logic is omitted here. net = getnet() -net.initialize(init=PrintInit()) +net.initialize(init=MyInit()) ``` Note that, although `MyInit` will print information about the model parameters @@ -130,7 +130,7 @@ initialization when calling the `initialize` function - this we define the input and perform a forward calculation. ```{.python .input n=25} -x = nd.random.uniform(shape=(2, 20)) +x = np.random.uniform(size=(2, 20)) y = net(x) ``` @@ -254,8 +254,8 @@ $$ class MyInit(init.Initializer): def _init_weight(self, name, data): print('Init', name, data.shape) - data[:] = nd.random.uniform(low=-10, high=10, shape=data.shape) - data *= data.abs() >= 5 + data[:] = np.random.uniform(low=-10, high=10, size=data.shape) + data *= np.abs(data) >= 5 net.initialize(MyInit(), force_reinit=True) net[0].weight.data()[0] @@ -288,11 +288,11 @@ net = nn.Sequential() shared = nn.Dense(8, activation='relu') net.add(nn.Dense(8, activation='relu'), shared, - nn.Dense(8, activation='relu', params=shared.params), + nn.Dense(8, activation='relu').share_parameters(shared.params), nn.Dense(10)) net.initialize() -x = nd.random.uniform(shape=(2, 20)) +x = np.random.uniform(size=(2, 20)) net(x) # Check whether the parameters are the same. 
diff --git a/docs/python_docs/python/tutorials/packages/gluon/blocks/naming.md b/docs/python_docs/python/tutorials/packages/gluon/blocks/naming.md index 511bd9b2a4b5..60b620d0fb39 100644 --- a/docs/python_docs/python/tutorials/packages/gluon/blocks/naming.md +++ b/docs/python_docs/python/tutorials/packages/gluon/blocks/naming.md @@ -35,7 +35,7 @@ When creating a block, you can simply do as follows: ```{.python .input} mydense = gluon.nn.Dense(100) -print(mydense.name) +print(mydense.__class__.__name__) ``` When you create more Blocks of the same kind, they will be named with incrementing suffixes to avoid collision: @@ -43,7 +43,7 @@ When you create more Blocks of the same kind, they will be named with incrementi ```{.python .input} dense1 = gluon.nn.Dense(100) -print(dense1.name) +print(dense1.__class__.__name__) ``` ## Naming Parameters @@ -62,7 +62,7 @@ When getting parameters within a Block, you should use the structure based name ```{.python .input} -print(dense0.collect_params()) +print(dense1.collect_params()) ``` ## Nested Blocks @@ -79,14 +79,14 @@ class Model(gluon.HybridBlock): self.mydense = gluon.nn.Dense(20) def forward(self, x): - x = mx.nd.relu(self.dense0(x)) - x = mx.nd.relu(self.dense1(x)) - return mx.nd.relu(self.mydense(x)) + x = mx.npx.relu(self.dense0(x)) + x = mx.npx.relu(self.dense1(x)) + return mx.npx.relu(self.mydense(x)) model0 = Model() model0.initialize() model0.hybridize() -model0(mx.nd.zeros((1, 20))) +model0(mx.np.zeros((1, 20))) ``` The same principle also applies to container blocks like Sequential. We can simply do as follows: @@ -106,9 +106,10 @@ For `HybridBlock`, we use `save_parameters`/`load_parameters`, which uses model ```{.python .input} +model1 = Model() model0.save_parameters('model.params') model1.load_parameters('model.params') -print(mx.nd.load('model.params').keys()) +print(mx.npx.load('model.params').keys()) ``` For `SymbolBlock.imports`, we use `export`, which uses parameter name `param.name`, to save parameters. @@ -124,15 +125,15 @@ Sometimes you may want to load a pretrained model, and replace certain Blocks in For example, the alexnet in model zoo has 1000 output dimensions, but maybe you only have 100 classes in your application. -To see how to do this, we first load a pretrained AlexNet. +To see how to do this, we first load a pretrained ResNet. - In Gluon model zoo, all image classification models follow the format where the feature extraction layers are named `features` while the output layer is named `output`. - Note that the output layer is a dense block with 1000 dimension outputs. ```{.python .input} -alexnet = gluon.model_zoo.vision.alexnet(pretrained=True) -print(alexnet.output) +resnet = gluon.model_zoo.vision.resnet50_v2() +print(resnet.output) ``` @@ -140,6 +141,6 @@ To change the output to 100 dimension, we replace it with a new block. 
```{.python .input} -alexnet.output = gluon.nn.Dense(100) -alexnet.output.initialize() +resnet.output = gluon.nn.Dense(100) +resnet.output.initialize() ``` diff --git a/docs/python_docs/python/tutorials/packages/gluon/blocks/nn.md b/docs/python_docs/python/tutorials/packages/gluon/blocks/nn.md index 0b4b3445902f..5e39f283dc0b 100644 --- a/docs/python_docs/python/tutorials/packages/gluon/blocks/nn.md +++ b/docs/python_docs/python/tutorials/packages/gluon/blocks/nn.md @@ -38,11 +38,11 @@ follows: ```{.python .input n=1} import mxnet as mx -from mxnet import nd -from mxnet.gluon import nn, Block +from mxnet import np, npx +from mxnet.gluon import nn, Block, Parameter, Constant -x = nd.random.uniform(shape=(2, 20)) +x = np.random.uniform(size=(2, 20)) net = nn.Sequential() net.add(nn.Dense(256, activation='relu')) @@ -97,19 +97,21 @@ This may help you understand more clearly how the `Sequential` class works. ```{.python .input n=3} class MySequential(Block): - def __init__(self, **kwargs): - super(MySequential, self).__init__(**kwargs) + def __init__(self): + super(MySequential, self).__init__() + self._layers = [] def add(self, block): # Here, block is an instance of a Block subclass, and we assume it has a unique name. We save it in the - # member variable _children of the Block class, and its type is OrderedDict. When the MySequential instance - # calls the initialize function, the system automatically initializes all members of _children. - self._children[block.name] = block + # member variable _layers of the Block class, and its type is List. When the MySequential instance + # calls the initialize function, the system automatically initializes all members of _layers. + self._layers.append(block) + self.register_child(block) def forward(self, x): # OrderedDict guarantees that members will be traversed in the order they were added. for block in self._children.values(): - x = block(x) + x = block()(x) return x ``` @@ -229,8 +231,7 @@ class FancyMLP(nn.Block): # Random weight parameters created with the get_constant are not # iterated during training (i.e. constant parameters). - self.rand_weight = mx.gluon.Constant( - 'rand_weight', nd.random.uniform(shape=(20, 20))) + self.rand_weight = Constant(np.random.uniform(size=(20, 20))) self.dense = nn.Dense(20, activation='relu') def forward(self, x): @@ -238,16 +239,16 @@ class FancyMLP(nn.Block): # Use the constant parameters created, as well as the ReLU and dot # functions of NDArray. - x = nd.relu(nd.dot(x, self.rand_weight.data()) + 1) + x = npx.relu(np.dot(x, self.rand_weight.data()) + 1) # Re-use the fully connected layer. This is equivalent to sharing # parameters with two fully connected layers. x = self.dense(x) - # Here in the control flow, we need to call `asscalar` to return the + # Here in the control flow, we need to call `item` to return the # scalar for comparison. - while x.norm().asscalar() > 1: + while npx.norm(x).item() > 1: x /= 2 - if x.norm().asscalar() < 0.8: + if npx.norm(x).item() < 0.8: x *= 10 return x.sum() ``` diff --git a/docs/python_docs/python/tutorials/packages/gluon/blocks/parameters.md b/docs/python_docs/python/tutorials/packages/gluon/blocks/parameters.md index 8708bb460daa..5e231f030576 100644 --- a/docs/python_docs/python/tutorials/packages/gluon/blocks/parameters.md +++ b/docs/python_docs/python/tutorials/packages/gluon/blocks/parameters.md @@ -28,7 +28,7 @@ This section shows how to manipulate parameters. 
In particular we will cover the As always, we start with a Multilayer Perceptron with a single hidden layer. We will use it to demonstrate the aspects mentioned above. ```{.python .input n=1} -from mxnet import init, nd +from mxnet import init, np from mxnet.gluon import nn @@ -37,7 +37,7 @@ net.add(nn.Dense(256, activation='relu')) net.add(nn.Dense(10)) net.initialize() # Use the default initialization method -x = nd.random.uniform(shape=(2, 20)) +x = np.random.uniform(size=(2, 20)) net(x) # Forward computation ``` @@ -46,11 +46,10 @@ net(x) # Forward computation In case of a Sequential class we can access the parameters simply by indexing each layer of the network. The `params` variable contains the required data. Let's try this out in practice by inspecting the parameters of the first layer. ```{.python .input n=2} -print(net[0].params) -print(net[1].params) +print(net.collect_params()) ``` -From the output we can see that the layer consists of two sets of parameters: `dense0_weight` and `dense0_bias`. They are both single precision and they have the necessary shapes that we would expect from the first layer, given that the input dimension is 20 and the output dimension 256. The names of the parameters are very useful, because they allow us to identify parameters *uniquely* even in a network of hundreds of layers and with nontrivial structure. The second layer is structured in a similar way. +From the output we can see that the layer consists of two sets of parameters: `0.weight` and `0.bias`. They are both single precision and they have the necessary shapes that we would expect from the first layer, given that the input dimension is 20 and the output dimension 256. The names of the parameters are very useful, because they allow us to identify parameters *uniquely* even in a network of hundreds of layers and with nontrivial structure. The second layer is structured in a similar way. ### Targeted Parameters @@ -63,11 +62,11 @@ print(net[1].bias.data()) The first line returns the bias of the second layer. Since this is an object containing data, gradients, and additional information, we need to request the data explicitly. To request the data, we call `data` method on the parameter on the second line. Note that the bias is all 0 since we initialized the bias to contain all zeros. -We can also access the parameter by name, such as `dense0_weight`. This is possible since each layer comes with its own parameter dictionary that can be accessed directly. Both methods are entirely equivalent, but the first method leads to more readable code. +We can also access the parameter by name, such as `0.weight`. This is possible since each layer comes with its own parameter dictionary that can be accessed directly. Both methods are entirely equivalent, but the first method leads to more readable code. ```{.python .input n=4} -print(net[0].params['dense0_weight']) -print(net[0].params['dense0_weight'].data()) +print(net[0].params['weight']) +print(net[0].params['weight'].data()) ``` Note that the weights are nonzero as they were randomly initialized when we constructed the network. @@ -92,14 +91,14 @@ print(net.collect_params()) This provides us with the third way of accessing the parameters of the network. 
If we want to get the value of the bias term of the second layer we could simply use this: ```{.python .input n=7} -net.collect_params()['dense1_bias'].data() +net.collect_params()['1.bias'].data() ``` By adding a regular expression as an argument to `collect_params` method, we can select only a particular set of parameters whose names are matched by the regular expression. ```{.python .input n=8} print(net.collect_params('.*weight')) -print(net.collect_params('dense0.*')) +print(net.collect_params('0.*')) ``` ### Rube Goldberg strikes again @@ -197,8 +196,8 @@ $$ class MyInit(init.Initializer): def _init_weight(self, name, data): print('Init', name, data.shape) - data[:] = nd.random.uniform(low=-10, high=10, shape=data.shape) - data *= data.abs() >= 5 + data[:] = np.random.uniform(low=-10, high=10, size=data.shape) + data *= np.abs(data) >= 5 net.initialize(MyInit(), force_reinit=True) net[0].weight.data()[0] @@ -223,11 +222,11 @@ net = nn.Sequential() shared = nn.Dense(8, activation='relu') net.add(nn.Dense(8, activation='relu'), shared, - nn.Dense(8, activation='relu', params=shared.params), + nn.Dense(8, activation='relu').share_parameters(shared.params), nn.Dense(10)) net.initialize() -x = nd.random.uniform(shape=(2, 20)) +x = np.random.uniform(size=(2, 20)) net(x) # Check whether the parameters are the same diff --git a/docs/python_docs/python/tutorials/packages/gluon/blocks/save_load_params.md b/docs/python_docs/python/tutorials/packages/gluon/blocks/save_load_params.md index 4aea789b6dbb..932175d3dc1b 100644 --- a/docs/python_docs/python/tutorials/packages/gluon/blocks/save_load_params.md +++ b/docs/python_docs/python/tutorials/packages/gluon/blocks/save_load_params.md @@ -35,11 +35,10 @@ Let's look at the above methods in more detail. Let's start by importing the mod from __future__ import print_function import mxnet as mx -import mxnet.ndarray as nd -from mxnet import nd, autograd, gluon +from mxnet import np, npx, autograd, gluon from mxnet.gluon.data.vision import transforms -import numpy as np +import numpy as onp ``` ## Setup: build and train a simple model @@ -109,7 +108,7 @@ def train_model(model): # Print loss once in a while if batch_num % 50 == 0: - curr_loss = nd.mean(loss).asscalar() + curr_loss = np.mean(loss).item() print("Epoch: %d; Batch %d; Loss %f" % (epoch, batch_num, curr_loss)) ``` @@ -183,16 +182,16 @@ def verify_loaded_model(net): for data, label in sample_data: # Display the images - img = nd.transpose(data, (1,0,2,3)) - img = nd.reshape(img, (28,10*28,1)) - imtiles = nd.tile(img, (1,1,3)) + img = np.transpose(data, (1,0,2,3)) + img = npx.reshape(img, (28,10*28,1)) + imtiles = np.tile(img, (1,1,3)) plt.imshow(imtiles.asnumpy()) plt.show() # Display the predictions - data = nd.transpose(data, (0, 3, 1, 2)) - out = net(data.as_in_context(ctx)) - predictions = nd.argmax(out, axis=1) + data = np.transpose(data, (0, 3, 1, 2)) + out = net(data.as_in_ctx(ctx)) + predictions = np.argmax(out, axis=1) print('Model predictions: ', predictions.asnumpy()) break diff --git a/docs/python_docs/python/tutorials/packages/gluon/data/datasets.md b/docs/python_docs/python/tutorials/packages/gluon/data/datasets.md index ecbe1216b783..dafc36e32525 100644 --- a/docs/python_docs/python/tutorials/packages/gluon/data/datasets.md +++ b/docs/python_docs/python/tutorials/packages/gluon/data/datasets.md @@ -32,9 +32,9 @@ import mxnet as mx import os import tarfile -mx.random.seed(42) # Fix the seed for reproducibility -X = mx.random.uniform(shape=(10, 3)) -y = 
mx.random.uniform(shape=(10, 1)) +mx.np.random.seed(42) # Fix the seed for reproducibility +X = mx.np.random.uniform(size=(10, 3)) +y = mx.np.random.uniform(size=(10, 1)) dataset = mx.gluon.data.dataset.ArrayDataset(X, y) ``` @@ -51,12 +51,6 @@ assert sample[1].shape == (1, ) print(sample) ``` -( -[ 0.4375872 0.29753461 0.89177299] -, -[ 0.83261985] -) - We get a tuple of a data sample and its corresponding label, which makes sense because we passed the data `X` and the labels `y` in that order when we instantiated the `ArrayDataset`. We don't usually retrieve individual samples from `Dataset` objects though (unless we're quality checking the output samples). Instead we use a `DataLoader`. @@ -100,8 +94,8 @@ def transform(data, label): data = data.astype('float32')/255 return data, label -train_dataset = mx.gluon.data.vision.datasets.FashionMNIST(train=True, transform=transform) -valid_dataset = mx.gluon.data.vision.datasets.FashionMNIST(train=False, transform=transform) +train_dataset = mx.gluon.data.vision.datasets.FashionMNIST(train=True).transform(transform) +valid_dataset = mx.gluon.data.vision.datasets.FashionMNIST(train=False).transform(transform) ``` @@ -118,7 +112,7 @@ label_desc = {0:'T-shirt/top', 1:'Trouser', 2:'Pullover', 3:'Dress', 4:'Coat', 5 imshow(data[:,:,0].asnumpy(), cmap='gray') print("Data type: {}".format(data.dtype)) print("Label: {}".format(label)) -print("Label description: {}".format(label_desc[label])) +print("Label description: {}".format(label_desc[label.item()])) ``` `Data type: ` @@ -172,31 +166,31 @@ trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.1}) epochs = 5 for epoch in range(epochs): # training loop (with autograd and trainer steps, etc.) - cumulative_train_loss = mx.nd.zeros(1, ctx=ctx) + cumulative_train_loss = mx.np.zeros(1, ctx=ctx) training_samples = 0 for batch_idx, (data, label) in enumerate(train_data_loader): - data = data.as_in_context(ctx).reshape((-1, 784)) # 28*28=784 - label = label.as_in_context(ctx) + data = data.as_in_ctx(ctx).reshape((-1, 784)) # 28*28=784 + label = label.as_in_ctx(ctx) with autograd.record(): output = net(data) loss = criterion(output, label) loss.backward() trainer.step(data.shape[0]) - cumulative_train_loss += loss.sum() + cumulative_train_loss += mx.np.sum(loss) training_samples += data.shape[0] - train_loss = cumulative_train_loss.asscalar()/training_samples + train_loss = cumulative_train_loss.item()/training_samples # validation loop - cumulative_valid_loss = mx.nd.zeros(1, ctx) + cumulative_valid_loss = mx.np.zeros(1, ctx=ctx) valid_samples = 0 for batch_idx, (data, label) in enumerate(valid_data_loader): - data = data.as_in_context(ctx).reshape((-1, 784)) # 28*28=784 - label = label.as_in_context(ctx) + data = data.as_in_ctx(ctx).reshape((-1, 784)) # 28*28=784 + label = label.as_in_ctx(ctx) output = net(data) loss = criterion(output, label) - cumulative_valid_loss += loss.sum() + cumulative_valid_loss += mx.np.sum(loss) valid_samples += data.shape[0] - valid_loss = cumulative_valid_loss.asscalar()/valid_samples + valid_loss = cumulative_valid_loss.item()/valid_samples print("Epoch {}, training loss: {:.2f}, validation loss: {:.2f}".format(epoch, train_loss, valid_loss)) ``` @@ -290,7 +284,7 @@ assert label == 1 Sometimes you have data that doesn't quite fit the format expected by the included [Dataset](../../../../api/gluon/data/index.rst#mxnet.gluon.data.Dataset)s. 
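As the next paragraph explains, all a custom dataset needs is a `__getitem__` method (plus, in practice, `__len__`). Purely as an illustration, with a hypothetical class name that is not part of the tutorial, a hand-rolled dataset over two in-memory arrays might look like this sketch:

```python
import mxnet as mx

class PairDataset(mx.gluon.data.Dataset):
    """Minimal custom dataset wrapping two in-memory arrays (hypothetical example)."""
    def __init__(self, features, targets):
        self._features = features
        self._targets = targets

    def __getitem__(self, idx):
        # Return one sample as a (data, label) tuple of mx.np.ndarray's
        return self._features[idx], self._targets[idx]

    def __len__(self):
        return len(self._features)

# Usage sketch: behaves like the ArrayDataset used earlier in this tutorial
X = mx.np.random.uniform(size=(10, 3))
y = mx.np.random.uniform(size=(10, 1))
print(PairDataset(X, y)[0])
```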
You might be able to preprocess your data to fit the expected format, but it is easy to create your own dataset to do this. -All you need to do is create a class that implements a `__getitem__` method, that returns a sample (i.e. a tuple of [mx.nd.NDArray](../../../../api/legacy/ndarray/ndarray.rst#mxnet.ndarray.NDArray)'s). +All you need to do is create a class that implements a `__getitem__` method, that returns a sample (i.e. a tuple of [mx.np.ndarray](../../../../api/np/arrays.ndarray.rst#the-n-dimensional-array-ndarray)'s). # Appendix: Upgrading from Module `DataIter` to Gluon `DataLoader` diff --git a/docs/python_docs/python/tutorials/packages/gluon/image/info_gan.md b/docs/python_docs/python/tutorials/packages/gluon/image/info_gan.md index ff6293964543..34c6a3d53129 100644 --- a/docs/python_docs/python/tutorials/packages/gluon/image/info_gan.md +++ b/docs/python_docs/python/tutorials/packages/gluon/image/info_gan.md @@ -34,7 +34,6 @@ import time import numpy as onp from matplotlib import pyplot as plt -from mxboard import SummaryWriter import mxnet as mx from mxnet import gluon from mxnet import np, npx @@ -141,7 +140,7 @@ class Generator(gluon.HybridBlock): def forward(self, x): x = self.prev(x) - x = np.reshape(x, (0, -1, 1, 1)) + x = np.reshape(x, (-2, -1, 1, 1)) return self.G(x) ``` @@ -274,75 +273,65 @@ Define the training loop. ```{.python .input} -with SummaryWriter(logdir='./logs/') as sw: - - epochs = 1 - counter = 0 - for epoch in range(epochs): - print("Epoch", epoch) - starttime = time.time() - - d_error_epoch = np.zeros((1,), ctx=ctx) - g_error_epoch = np.zeros((1,), ctx=ctx) - - for idx, data in enumerate(train_dataloader): +epochs = 1 +counter = 0 +for epoch in range(epochs): + print("Epoch", epoch) + starttime = time.time() - #get real data and generator input - real_data = data.as_in_context(ctx) - g_input, label, c2 = create_generator_input() + d_error_epoch = np.zeros((1,), ctx=ctx) + g_error_epoch = np.zeros((1,), ctx=ctx) + for idx, data in enumerate(train_dataloader): - #Update discriminator: Input real data and fake data - with autograd.record(): - output_real,_,_ = discriminator(real_data) - d_error_real = loss1(output_real, real_label) + #get real data and generator input + real_data = data.as_in_context(ctx) + g_input, label, c2 = create_generator_input() - # create fake image and input it to discriminator - fake_image = generator(g_input) - output_fake,_,_ = discriminator(fake_image.detach()) - d_error_fake = loss1(output_fake, fake_label) - # total discriminator error - d_error = d_error_real + d_error_fake + #Update discriminator: Input real data and fake data + with autograd.record(): + output_real,_,_ = discriminator(real_data) + d_error_real = loss1(output_real, real_label) - d_error_epoch += d_error.mean() + # create fake image and input it to discriminator + fake_image = generator(g_input) + output_fake,_,_ = discriminator(fake_image.detach()) + d_error_fake = loss1(output_fake, fake_label) - #Update D every second iteration - if (counter+1) % 2 == 0: - d_error.backward() - d_trainer.step(batch_size) + # total discriminator error + d_error = d_error_real + d_error_fake - #Update generator: Input random noise and latent code vector - with autograd.record(): - fake_image = generator(g_input) - output_fake, category_prob, continuous_mean = discriminator(fake_image) - g_error = loss1(output_fake, real_label) + loss3(category_prob, label) + loss2(c2, continuous_mean) + d_error_epoch += d_error.mean() - g_error.backward() - g_error_epoch += g_error.mean() 
+ #Update D every second iteration + if (counter+1) % 2 == 0: + d_error.backward() + d_trainer.step(batch_size) - g_trainer.step(batch_size) - q_trainer.step(batch_size) + #Update generator: Input random noise and latent code vector + with autograd.record(): + fake_image = generator(g_input) + output_fake, category_prob, continuous_mean = discriminator(fake_image) + g_error = loss1(output_fake, real_label) + loss3(category_prob, label) + loss2(c2, continuous_mean) - # logging - if idx % 10 == 0: - count = idx + 1 - logging.info('speed: {} samples/s'.format(batch_size / (time.time() - starttime))) - logging.info('discriminator loss = %f, generator loss = %f at iter %d epoch %d' - %(d_error_epoch.item()/count,g_error_epoch.item()/count, count, epoch)) + g_error.backward() + g_error_epoch += g_error.mean() - g_input,_,_ = create_generator_input() + g_trainer.step(batch_size) + q_trainer.step(batch_size) - # create some fake image for logging in MXBoard - fake_image = generator(g_input) + # logging + if idx % 10 == 0: + count = idx + 1 + logging.info('speed: {} samples/s'.format(batch_size / (time.time() - starttime))) + logging.info('discriminator loss = %f, generator loss = %f at iter %d epoch %d' + %(d_error_epoch.item()/count,g_error_epoch.item()/count, count, epoch)) - sw.add_scalar(tag='Loss_D', value={'test':d_error_epoch.item()/count}, global_step=counter) - sw.add_scalar(tag='Loss_G', value={'test':d_error_epoch.item()/count}, global_step=counter) - sw.add_image(tag='data_image', image=((fake_image[0]+ 1.0) * 127.5).astype(onp.uint8) , global_step=counter) - sw.flush() + g_input,_,_ = create_generator_input() - discriminator.save_parameters("infogan_d_latest.params") - generator.save_parameters("infogan_g_latest.params") + discriminator.save_parameters("infogan_d_latest.params") + generator.save_parameters("infogan_g_latest.params") ``` ## Image similarity @@ -425,7 +414,7 @@ We trained the Generator for a couple of epochs and stored a couple of fake imag The following function computes the TSNE on the feature matrix and stores the result in a json-file. This file can be loaded with [TSNEViewer](https://ml4a.github.io/guides/ImageTSNEViewer/) -```{.python .input} +```{.python} import json from sklearn.manifold import TSNE diff --git a/docs/python_docs/python/tutorials/packages/gluon/image/mnist.md b/docs/python_docs/python/tutorials/packages/gluon/image/mnist.md index 00ff5dbbc1a1..64d81e463dde 100644 --- a/docs/python_docs/python/tutorials/packages/gluon/image/mnist.md +++ b/docs/python_docs/python/tutorials/packages/gluon/image/mnist.md @@ -46,10 +46,13 @@ Before we define the model, let's first fetch the [MNIST](http://yann.lecun.com/ The following source code downloads and loads the images and the corresponding labels into memory. ```{.python .input} +import os import mxnet as mx +from mxnet import gluon +from mxnet.gluon.data.vision import transforms # Fixing the random seed -mx.random.seed(42) +mx.np.random.seed(42) mnist = mx.test_utils.get_mnist() ``` @@ -62,9 +65,18 @@ Data iterators take care of this by randomly shuffling the inputs. Note that we The following source code initializes the data iterators for the MNIST dataset. Note that we initialize two iterators: one for train data and one for test data. 
```{.python .input} +def transform(data, label): + return data.astype(np.float32)/255, label.astype(np.float32) + batch_size = 100 -train_data = mx.io.NDArrayIter(mnist['train_data'], mnist['train_label'], batch_size, shuffle=True) -val_data = mx.io.NDArrayIter(mnist['test_data'], mnist['test_label'], batch_size) +num_workers = 8 +train_data = gluon.data.DataLoader( + gluon.data.vision.MNIST(train=True).transform_first(transforms.ToTensor()), + batch_size=batch_size, shuffle=True, num_workers=num_workers) + +val_data = gluon.data.DataLoader( + gluon.data.vision.MNIST(train=False).transform(transform), + batch_size=batch_size, shuffle=False, num_workers=num_workers) ``` ## Approaches @@ -115,7 +127,7 @@ initialized parameters. ```{.python .input} gpus = mx.test_utils.list_gpus() -ctx = [mx.gpu()] if gpus else [mx.cpu(0), mx.cpu(1)] +ctx = mx.gpu() if gpus else [mx.cpu(0), mx.cpu(1)] net.initialize(mx.init.Xavier(magnitude=2.24), ctx=ctx) trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.02}) ``` @@ -142,26 +154,18 @@ training scope which is defined by `autograd.record()`. %%time epoch = 10 # Use Accuracy as the evaluation metric. -metric = mx.metric.Accuracy() +metric = mx.gluon.metric.Accuracy() softmax_cross_entropy_loss = gluon.loss.SoftmaxCrossEntropyLoss() for i in range(epoch): - # Reset the train data iterator. - train_data.reset() # Loop over the train data iterator. - for batch in train_data: - # Splits train data into multiple slices along batch_axis - # and copy each slice into a context. - data = gluon.utils.split_and_load(batch.data[0], ctx_list=ctx, batch_axis=0) - # Splits train labels into multiple slices along batch_axis - # and copy each slice into a context. - label = gluon.utils.split_and_load(batch.label[0], ctx_list=ctx, batch_axis=0) + for batch_num, (data, label) in enumerate(train_data): outputs = [] # Inside training scope with ag.record(): for x, y in zip(data, label): - z = net(x) + z = net(x.as_in_ctx(ctx)) # Computes softmax cross entropy loss. - loss = softmax_cross_entropy_loss(z, y) + loss = softmax_cross_entropy_loss(z, y.as_in_ctx(ctx)) # Backpropagate the error for one iteration. loss.backward() outputs.append(z) @@ -169,7 +173,7 @@ for i in range(epoch): metric.update(label, outputs) # Make one step of parameter update. Trainer needs to know the # batch size of data to normalize the gradient by 1/batch_size. - trainer.step(batch.data[0].shape[0]) + trainer.step(data.shape[0]) # Gets the evaluation result. name, acc = metric.get() # Reset evaluation result to initial state. @@ -183,20 +187,12 @@ After the above training completes, we can evaluate the trained model by running ```{.python .input} # Use Accuracy as the evaluation metric. -metric = mx.metric.Accuracy() -# Reset the validation data iterator. -val_data.reset() +metric = mx.gluon.metric.Accuracy() # Loop over the validation data iterator. -for batch in val_data: - # Splits validation data into multiple slices along batch_axis - # and copy each slice into a context. - data = gluon.utils.split_and_load(batch.data[0], ctx_list=ctx, batch_axis=0) - # Splits validation label into multiple slices along batch_axis - # and copy each slice into a context. 
- label = gluon.utils.split_and_load(batch.label[0], ctx_list=ctx, batch_axis=0) +for batch_num, (data, label) in enumerate(val_data): outputs = [] for x in data: - outputs.append(net(x)) + outputs.append(net(x.as_in_ctx(ctx))) # Updates internal evaluation metric.update(label, outputs) print('validation acc: %s=%f'%metric.get()) @@ -217,7 +213,7 @@ A typical way to write your network is creating a new class inherited from `gluo class. We can define the network by composing and inheriting Block class as follows: ```{.python .input} -import mxnet.ndarray as F +from mxnet import np, npx class Net(gluon.Block): def __init__(self, **kwargs): @@ -230,13 +226,13 @@ class Net(gluon.Block): self.fc2 = nn.Dense(10) def forward(self, x): - x = self.pool1(F.tanh(self.conv1(x))) - x = self.pool2(F.tanh(self.conv2(x))) + x = self.pool1(np.tanh(self.conv1(x))) + x = self.pool2(np.tanh(self.conv2(x))) # 0 means copy over size from corresponding dimension. # -1 means infer size from the rest of dimensions. - x = x.reshape((0, -1)) - x = F.tanh(self.fc1(x)) - x = F.tanh(self.fc2(x)) + x = x.reshape((-2, -1)) + x = np.tanh(self.fc1(x)) + x = np.tanh(self.fc2(x)) return x ``` @@ -273,27 +269,19 @@ trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.03}) ```{.python .input} # Use Accuracy as the evaluation metric. -metric = mx.metric.Accuracy() +metric = mx.gluon.metric.Accuracy() softmax_cross_entropy_loss = gluon.loss.SoftmaxCrossEntropyLoss() for i in range(epoch): - # Reset the train data iterator. - train_data.reset() # Loop over the train data iterator. - for batch in train_data: - # Splits train data into multiple slices along batch_axis - # and copy each slice into a context. - data = gluon.utils.split_and_load(batch.data[0], ctx_list=ctx, batch_axis=0) - # Splits train labels into multiple slices along batch_axis - # and copy each slice into a context. - label = gluon.utils.split_and_load(batch.label[0], ctx_list=ctx, batch_axis=0) + for batch_num, (data, label) in enumerate(train_data): outputs = [] # Inside training scope with ag.record(): for x, y in zip(data, label): - z = net(x) + z = net(x.as_in_ctx(ctx)) # Computes softmax cross entropy loss. - loss = softmax_cross_entropy_loss(z, y) + loss = softmax_cross_entropy_loss(z, y.as_in_ctx(ctx)) # Backpropogate the error for one iteration. loss.backward() outputs.append(z) @@ -301,7 +289,7 @@ for i in range(epoch): metric.update(label, outputs) # Make one step of parameter update. Trainer needs to know the # batch size of data to normalize the gradient by 1/batch_size. - trainer.step(batch.data[0].shape[0]) + trainer.step(data.shape[0]) # Gets the evaluation result. name, acc = metric.get() # Reset evaluation result to initial state. @@ -315,20 +303,12 @@ Finally, we'll use the trained LeNet model to generate predictions for the test ```{.python .input} # Use Accuracy as the evaluation metric. -metric = mx.metric.Accuracy() -# Reset the validation data iterator. -val_data.reset() +metric = mx.gluon.metric.Accuracy() # Loop over the validation data iterator. -for batch in val_data: - # Splits validation data into multiple slices along batch_axis - # and copy each slice into a context. - data = gluon.utils.split_and_load(batch.data[0], ctx_list=ctx, batch_axis=0) - # Splits validation label into multiple slices along batch_axis - # and copy each slice into a context. 
- label = gluon.utils.split_and_load(batch.label[0], ctx_list=ctx, batch_axis=0) +for batch_num, (data, label) in enumerate(val_data): outputs = [] for x in data: - outputs.append(net(x)) + outputs.append(net(x.as_in_ctx(ctx))) # Updates internal evaluation metric.update(label, outputs) print('validation acc: %s=%f'%metric.get()) diff --git a/docs/python_docs/python/tutorials/packages/gluon/loss/custom-loss.md b/docs/python_docs/python/tutorials/packages/gluon/loss/custom-loss.md index 3baaf732fe23..da507c0b2f7e 100644 --- a/docs/python_docs/python/tutorials/packages/gluon/loss/custom-loss.md +++ b/docs/python_docs/python/tutorials/packages/gluon/loss/custom-loss.md @@ -114,7 +114,7 @@ class GetImagePairs(mx.gluon.data.vision.ImageFolderDataset): image1_index, image1_tuple = random.choice(items_with_index) image0 = super().__getitem__(image0_index) image1 = super().__getitem__(image1_index) - label = mx.nd.array([int(image1_tuple[1] != image0_tuple[1])]) + label = mx.np.array([int(image1_tuple[1] != image0_tuple[1])]) return image0[0], image1[0], label def __len__(self): @@ -146,7 +146,7 @@ Following code plots some examples from the test dataset. ```{.python .input} img1, img2, label = test[0] -print("Same: {}".format(int(label.asscalar()) == 0)) +print("Same: {}".format(int(label.item()) == 0)) fig, (ax0, ax1) = plt.subplots(ncols=2, figsize=(10, 5)) ax0.imshow(img1.asnumpy()[:,:,0], cmap='gray') ax0.axis('off') @@ -183,7 +183,7 @@ for epoch in range(10): loss_contrastive = loss(output1, output2, label) loss_contrastive.backward() trainer.step(image1.shape[0]) - loss_mean = loss_contrastive.mean().asscalar() + loss_mean = loss_contrastive.mean().item() print("Epoch number {}\n Current loss {}\n".format(epoch, loss_mean)) ``` @@ -196,9 +196,9 @@ During inference we compute the Euclidean distance between the output vectors of for i, data in enumerate(test_dataloader): img1, img2, label = data output1, output2 = model(img1, img2) - dist_sq = mx.ndarray.sum(mx.ndarray.square(output1 - output2)) - dist = mx.ndarray.sqrt(dist_sq).asscalar() - print("Euclidean Distance:", dist, "Test label", label[0].asscalar()) + dist_sq = mx.np.sum(mx.np.square(output1 - output2)) + dist = mx.np.sqrt(dist_sq).item() + print("Euclidean Distance:", dist, "Test label", label[0].item()) fig, (ax0, ax1) = plt.subplots(ncols=2, figsize=(10, 5)) ax0.imshow(img1.asnumpy()[0, 0, :, :], cmap='gray') ax0.axis('off') diff --git a/docs/python_docs/python/tutorials/packages/gluon/loss/kl_divergence.md b/docs/python_docs/python/tutorials/packages/gluon/loss/kl_divergence.md index 86047d7c40a8..69f9c9032b5d 100644 --- a/docs/python_docs/python/tutorials/packages/gluon/loss/kl_divergence.md +++ b/docs/python_docs/python/tutorials/packages/gluon/loss/kl_divergence.md @@ -23,7 +23,7 @@ In MXNet Gluon, we can use [KLDivLoss](../../../../api/gluon/loss/index.rst#mxne As an example, let's compare a few categorical distributions (`dist_1`, `dist_2` and `dist_3`), each with 4 categories. -``` +```{.python .input} from matplotlib import pyplot as plt import mxnet as mx import numpy as np @@ -55,20 +55,20 @@ We often apply a [softmax](../../../../api/npx/generated/mxnet.npx.softmax.rst) Since we're already working with distributions in this example, we don't need to apply the softmax and only need to apply [log](../../../../api/np/generated/mxnet.np.log.rst). And we'll create batch dimensions even though we're working with single distributions. 
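For reference, with target distribution $p$ (`dist_b` below) and prediction $q$ (`dist_a` below), the quantity the helper estimates is

$$ D_{KL}(p \,\|\, q) = \sum_i p_i \left( \log p_i - \log q_i \right) $$

with the caveat, returned to at the end of this tutorial, that `KLDivLoss` averages rather than sums the per-category terms.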
-``` +```{.python .input} def kl_divergence(dist_a, dist_b): # add batch dimension - pred_batch = mx.nd.array(dist_a).expand_dims(0) - target_batch = mx.nd.array(dist_b).expand_dims(0) + pred_batch = mx.np.expand_dims(mx.np.array(dist_a), axis=0) + target_batch = mx.np.expand_dims(mx.np.array(dist_b), axis=0) # log the distribution - pred_batch = pred_batch.log() + pred_batch = mx.np.log(pred_batch) # create loss (assuming we have a logged prediction distribution) loss_fn = mx.gluon.loss.KLDivLoss(from_logits=True) divergence = loss_fn(pred_batch, target_batch) - return divergence.asscalar() + return divergence.item() ``` -``` +```{.python .input} print("Distribution 1 compared with Distribution 2: {}".format( kl_divergence(dist_1, dist_2))) print("Distribution 1 compared with Distribution 3: {}".format( @@ -83,24 +83,24 @@ As expected we see a smaller KL Divergence for distributions 1 & 2 than 1 & 3. A Alternatively, instead of manually applying the [log_softmax](../../../../api/npx/generated/mxnet.npx.log_softmax.rst) to our network outputs, we can leave that to the loss function. When setting `from_logits=False` on [KLDivLoss](../../../../api/gluon/loss/index.rst#mxnet.gluon.loss.KLDivLoss), the [log_softmax](../../../../api/npx/generated/mxnet.npx.log_softmax.rst) is applied to the first argument passed to `loss_fn`. As an example, let's assume our network outputs us the values below (favorably chosen so that when we [softmax](../../../../api/npx/generated/mxnet.npx.softmax.rst) these values we get the same distribution parameters as `dist_1`). -``` -output = mx.nd.array([0.39056206, 1.3068528, 0.39056206, -0.30258512]) +```{.python .input} +output = mx.np.array([0.39056206, 1.3068528, 0.39056206, -0.30258512]) ``` We can pass this to our [KLDivLoss](../../../../api/gluon/loss/index.rst#mxnet.gluon.loss.KLDivLoss) loss function (with `from_logits=False`) and get the same KL Divergence between `dist_1` and `dist_2` as before, because the [log_softmax](../../../../api/npx/generated/mxnet.npx.log_softmax.rst) is applied within the loss function. -``` +```{.python .input} def kl_divergence_not_from_logits(dist_a, dist_b): # add batch dimension - pred_batch = mx.nd.array(dist_a).expand_dims(0) - target_batch = mx.nd.array(dist_b).expand_dims(0) + pred_batch = mx.np.expand_dims(mx.np.array(dist_a), axis=0) + target_batch = mx.np.expand_dims(mx.np.array(dist_b), axis=0) # create loss (assuming we have a logged prediction distribution) loss_fn = mx.gluon.loss.KLDivLoss(from_logits=False) divergence = loss_fn(pred_batch, target_batch) - return divergence.asscalar() + return divergence.item() ``` -``` +```{.python .input} print("Distribution 1 compared with Distribution 2: {}".format( kl_divergence_not_from_logits(output, dist_2))) ``` @@ -110,11 +110,11 @@ print("Distribution 1 compared with Distribution 2: {}".format( Occasionally, you might have issues with [KLDivLoss](../../../../api/gluon/loss/index.rst#mxnet.gluon.loss.KLDivLoss). One common issue arises when the support of the distributions being compared are not the same. 'Support' here is referring to the values of the distribution which have a non-zero probability. Conveniently, all our examples above had the same support, but we might have a case where some categories have a probability of 0. 
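The reason this breaks is plain floating-point arithmetic rather than anything specific to Gluon: a zero probability sends `log` to `-inf`, and the indeterminate products that follow (for example `0 * -inf`) evaluate to `nan`, which survives every later reduction. A two-line check with classic NumPy (imported as `np`, matching this tutorial's convention) shows the values involved:

```python
import numpy as np  # classic NumPy, as used in this tutorial

print(np.log(0.0))           # -inf (NumPy warns about the divide by zero)
print(0.0 * np.log(0.0))     # nan: 0 * -inf is undefined
```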
-``` +```{.python .input} dist_4 = np.array([0, 0.9, 0, 0.1]) ``` -``` +```{.python .input} print("Distribution 4 compared with Distribution 1: {}".format( kl_divergence(dist_4, dist_1))) ``` @@ -125,12 +125,12 @@ We can see that the result is `nan`, which will obviously cause issues when calc One minor difference between the true definition of KL Divergence and the result from [KLDivLoss](../../../../api/gluon/loss/index.rst#mxnet.gluon.loss.KLDivLoss) is how the aggregation of category contributions is performed. Although the true definition sums up these contributions, the default behaviour in MXNet Gluon is to average terms along the batch dimension. As a result, the [KLDivLoss](../../../../api/gluon/loss/index.rst#mxnet.gluon.loss.KLDivLoss) output will be smaller than the true definition by a factor of the number of categories. -``` +```{.python .input} true_divergence = (dist_2*(np.log(dist_2)-np.log(dist_1))).sum() print('true_divergence: {}'.format(true_divergence)) ``` -``` +```{.python .input} num_categories = dist_1.shape[0] divergence = kl_divergence(dist_1, dist_2) print('divergence: {}'.format(divergence)) diff --git a/docs/python_docs/python/tutorials/packages/gluon/loss/loss.md b/docs/python_docs/python/tutorials/packages/gluon/loss/loss.md index 1cdf796decb6..a3647691661b 100644 --- a/docs/python_docs/python/tutorials/packages/gluon/loss/loss.md +++ b/docs/python_docs/python/tutorials/packages/gluon/loss/loss.md @@ -29,7 +29,7 @@ We'll first import the modules, where the `mxnet.gluon.loss` module is imported from IPython import display from matplotlib import pyplot as plt import mxnet as mx -from mxnet import nd, autograd +from mxnet import np, npx, autograd from mxnet.gluon import nn, loss as gloss ``` @@ -44,8 +44,8 @@ loss = gloss.L2Loss() And then feed two inputs to compute the elementwise loss values. ```{.python .input} -x = nd.ones((2,)) -y = nd.ones((2,)) * 2 +x = np.ones((2,)) +y = np.ones((2,)) * 2 loss(x, y) ``` @@ -58,7 +58,7 @@ These values should be equal to the math definition: $0.5\|x-y\|^2$. Next we show how to use a loss function to compute gradients. ```{.python .input} -X = nd.random.uniform(shape=(2, 4)) +X = np.random.uniform(size=(2, 4)) net = nn.Dense(1) net.initialize() with autograd.record(): @@ -88,8 +88,8 @@ def plot(x, y): plt.show() def show_regression_loss(loss): - x = nd.arange(-5, 5, .1) - y = loss(x, nd.zeros_like(x)) + x = np.arange(-5, 5, .1) + y = loss(x, np.zeros_like(x)) plot(x, y) ``` @@ -98,8 +98,8 @@ Then plot the classification losses with label values fixed to be 1. ```{.python .input} def show_classification_loss(loss): - x = nd.arange(-5, 5, .1) - y = loss(x, nd.ones_like(x)) + x = np.arange(-5, 5, .1) + y = loss(x, np.ones_like(x)) plot(x, y) ``` @@ -167,8 +167,8 @@ Running these two steps one-by-one, however, may lead to numerical instabilities ```{.python .input} loss = gloss.SoftmaxCrossEntropyLoss() -x = nd.array([[1, 10], [8, 2]]) -y = nd.array([0, 1]) +x = np.array([[1, 10], [8, 2]]) +y = np.array([0, 1]) loss(x, y) ``` @@ -210,9 +210,9 @@ The loss is large, if the predicted probability distribution is far from the gro For instance, in the following example we get a KL divergence of 0.02. We set ```from_logits=False```, so the loss functions will apply ```log_softmax``` on the network output, before computing the KL divergence. 
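As a quick sanity check on that statement (a sketch, not part of the tutorial itself), applying `npx.log_softmax` by hand and switching to `from_logits=True` should give the same loss value as the `from_logits=False` call shown next; the array values are the ones used below:

```python
from mxnet import np, npx
from mxnet.gluon import loss as gloss

logits = np.array([[0.39056206, 1.3068528, 0.39056206, -0.30258512]])
target = np.array([[0.3, 0.4, 0.1, 0.2]])

loss_internal = gloss.KLDivLoss(from_logits=False)(logits, target)                # log_softmax applied inside the loss
loss_manual = gloss.KLDivLoss(from_logits=True)(npx.log_softmax(logits), target)  # log_softmax applied by hand
print(loss_internal.item(), loss_manual.item())  # both should print the same value (about 0.02)
```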
```{.python .input} -output = mx.nd.array([[0.39056206, 1.3068528, 0.39056206, -0.30258512]]) -print('output.softmax(): {}'.format(output.softmax().asnumpy().tolist())) -target_dist = mx.nd.array([[0.3, 0.4, 0.1, 0.2]]) +output = mx.np.array([[0.39056206, 1.3068528, 0.39056206, -0.30258512]]) +print('output.softmax(): {}'.format(npx.softmax(output).asnumpy().tolist())) +target_dist = mx.np.array([[0.3, 0.4, 0.1, 0.2]]) loss_fn = gloss.KLDivLoss(from_logits=False) loss = loss_fn(output, target_dist) print('loss (kl divergence): {}'.format(loss.asnumpy().tolist())) @@ -253,9 +253,9 @@ Cosine distance measures the similarity between two arrays given a label and is For instance, in the following code example we measure the similarity between the input vectors `x` and `y`. Since they are the same the label equals `1`. The loss function returns $$ \sum_i 1 - {cos\_sim({input1}_i, {input2}_i)} $$ which is equal `0`. ```{.python .input} -x = mx.nd.array([1,0,1,0,1,0]) -y = mx.nd.array([1,0,1,0,1,0]) -label = mx.nd.array(1) +x = mx.np.array([1,0,1,0,1,0]) +y = mx.np.array([1,0,1,0,1,0]) +label = mx.np.array([1]) loss = gloss.CosineEmbeddingLoss() print(loss(x,y,label)) ``` @@ -263,9 +263,9 @@ print(loss(x,y,label)) Now let's make `y` the opposite of `x`, so we set the label `-1` and the function will return $$ \sum_i cos\_sim(input1, input2) $$ ```{.python .input} -x = mx.nd.array([1,0,1,0,1,0]) -y = mx.nd.array([0,1,0,1,0,1]) -label = mx.nd.array(-1) +x = mx.np.array([1,0,1,0,1,0]) +y = mx.np.array([0,1,0,1,0,1]) +label = mx.np.array([-1]) loss = gloss.CosineEmbeddingLoss() print(loss(x,y,label)) ``` @@ -293,10 +293,10 @@ $$ L = \text{pred} - \text{target} * \log(\text{pred}) +\log(\text{target!}) $$ Some examples in a batch may be more important than others. We can apply weights to individual examples during the forward pass of the loss function using the `sample_weight` argument. All examples are weighted equally by default. 
```{.python .input} -x = nd.ones((2,)) -y = nd.ones((2,)) * 2 +x = np.ones((2,)) +y = np.ones((2,)) * 2 loss = gloss.L2Loss() -loss(x, y, nd.array([1, 2])) +loss(x, y, np.array([1, 2])) ``` ## Conclusion diff --git a/docs/python_docs/python/tutorials/packages/gluon/training/fit_api_tutorial.md b/docs/python_docs/python/tutorials/packages/gluon/training/fit_api_tutorial.md index 5dc052390c43..9abe52388349 100644 --- a/docs/python_docs/python/tutorials/packages/gluon/training/fit_api_tutorial.md +++ b/docs/python_docs/python/tutorials/packages/gluon/training/fit_api_tutorial.md @@ -125,12 +125,12 @@ In the basic usage example, with just 2 lines of code, we will set up our model ```{.python .input} -train_acc = mx.metric.Accuracy() # Metric to monitor +train_acc = mx.gluon.metric.Accuracy() # Metric to monitor # Define the estimator, by passing to it the model, loss function, metrics, trainer object and context est = estimator.Estimator(net=resnet_18_v1, loss=loss_fn, - metrics=train_acc, + train_metrics=train_acc, trainer=trainer, context=ctx) @@ -194,7 +194,7 @@ class LossRecordHandler(TrainBegin, TrainEnd, EpochEnd): for metric in estimator.train_metrics: # look for train Loss in training metrics # we wrapped loss value as a metric to record it - if isinstance(metric, mx.metric.Loss): + if isinstance(metric, mx.gluon.metric.Loss): loss_name, loss_val = metric.get() # append loss value for this epoch self.loss_history.setdefault(loss_name, []).append(loss_val) @@ -207,7 +207,7 @@ class LossRecordHandler(TrainBegin, TrainEnd, EpochEnd): resnet_18_v1.initialize(force_reinit=True, init = mx.init.Xavier(), ctx=ctx) trainer = gluon.Trainer(resnet_18_v1.collect_params(), 'sgd', {'learning_rate': learning_rate}) -train_acc = mx.metric.Accuracy() +train_acc = mx.gluon.metric.Accuracy() ``` @@ -215,7 +215,7 @@ train_acc = mx.metric.Accuracy() # Define the estimator, by passing to it the model, loss function, metrics, trainer object and context est = estimator.Estimator(net=resnet_18_v1, loss=loss_fn, - metrics=train_acc, + train_metrics=train_acc, trainer=trainer, context=ctx) diff --git a/docs/python_docs/python/tutorials/packages/gluon/training/learning_rates/learning_rate_finder.md b/docs/python_docs/python/tutorials/packages/gluon/training/learning_rates/learning_rate_finder.md index b43358c0b83e..d446e60e6218 100644 --- a/docs/python_docs/python/tutorials/packages/gluon/training/learning_rates/learning_rate_finder.md +++ b/docs/python_docs/python/tutorials/packages/gluon/training/learning_rates/learning_rate_finder.md @@ -45,7 +45,7 @@ Usually, our unit of work is an epoch (a full pass through the dataset) and the import mxnet as mx # Set seed for reproducibility -mx.random.seed(42) +mx.np.random.seed(42) class Learner(): def __init__(self, net, data_loader, ctx): @@ -85,7 +85,7 @@ class Learner(): # Update parameters if take_step: self.trainer.step(data.shape[0]) # Set and return loss. 
- self.iteration_loss = mx.nd.mean(loss).asscalar() + self.iteration_loss = mx.np.mean(loss).item() return self.iteration_loss def close(self): diff --git a/docs/python_docs/python/tutorials/packages/gluon/training/learning_rates/learning_rate_schedules.md b/docs/python_docs/python/tutorials/packages/gluon/training/learning_rates/learning_rate_schedules.md index 2a9ca0de95fa..1a5f6f4516e3 100644 --- a/docs/python_docs/python/tutorials/packages/gluon/training/learning_rates/learning_rate_schedules.md +++ b/docs/python_docs/python/tutorials/packages/gluon/training/learning_rates/learning_rate_schedules.md @@ -242,7 +242,7 @@ for epoch in range(1, num_epochs+1): # Show loss and learning rate after first iteration of epoch if batch_num == 1: - curr_loss = mx.nd.mean(loss).asscalar() + curr_loss = mx.np.mean(loss).item() curr_lr = trainer.learning_rate print("Epoch: %d; Batch %d; Loss %f; LR %f" % (epoch, batch_num, curr_loss, curr_lr)) ``` @@ -310,7 +310,7 @@ for epoch in range(1, num_epochs + 1): trainer.step(data.shape[0]) # Show loss and learning rate after first iteration of epoch if batch_num == 1: - curr_loss = mx.nd.mean(loss).asscalar() + curr_loss = mx.np.mean(loss).item() curr_lr = trainer.learning_rate print("Epoch: %d; Batch %d; Loss %f; LR %f" % (epoch, batch_num, curr_loss, curr_lr)) iteration_idx += 1 diff --git a/docs/python_docs/python/tutorials/packages/gluon/training/normalization/index.md b/docs/python_docs/python/tutorials/packages/gluon/training/normalization/index.md index 49056b587e7a..9f07f620274d 100644 --- a/docs/python_docs/python/tutorials/packages/gluon/training/normalization/index.md +++ b/docs/python_docs/python/tutorials/packages/gluon/training/normalization/index.md @@ -45,7 +45,7 @@ When using pre-trained models from the [Gluon Model Zoo](https://mxnet.apache.or import mxnet as mx from mxnet.gluon.data.vision.transforms import Normalize -image_int = mx.nd.random.randint(low=0, high=256, shape=(1,3,2,2)) +image_int = mx.np.random.randint(low=0, high=256, size=(1,3,2,2)) image_float = image_int.astype('float32')/255 # the following normalization statistics are taken from gluon model zoo normalizer = Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) @@ -82,7 +82,7 @@ As an example, we'll apply `BatchNorm` to a batch of 2 samples, each with 2 chan ```{.python .input} -data = mx.nd.arange(start=0, stop=2*2*2*2).reshape(2, 2, 2, 2) +data = mx.np.arange(start=0, stop=2*2*2*2).reshape(2, 2, 2, 2) print(data) ``` @@ -110,7 +110,7 @@ Warning: `BatchNorm` assumes the channel dimension is the 2nd in order (i.e. 
`ax ```{.python .input} with mx.autograd.record(): output = net(data) - loss = output.abs() + loss = mx.np.abs(output) loss.backward() print(output) ``` @@ -119,8 +119,13 @@ We can immediately see the activations have been scaled down and centered around ```{.python .input} -batch_means = data.mean(axis=1, exclude=True) -batch_vars = (data - batch_means.reshape(1, -1, 1, 1)).square().mean(axis=1, exclude=True) +axes = list(range(data.ndim)) +del axes[1] +batch_means = mx.np.mean(data, axis=axes) +batch_square = mx.np.square(data - batch_means.reshape(1, -1, 1, 1)) +axes = list(range(batch_square.ndim)) +del axes[1] +batch_vars = mx.np.mean(batch_square, axis=axes) print('batch_means:', batch_means.asnumpy()) print('batch_vars:', batch_vars.asnumpy()) ``` @@ -129,7 +134,7 @@ And use these to scale the first entry in `data`, to confirm the `BatchNorm` cal ```{.python .input} -print("manually calculated:", ((data[0][0][0][0] - batch_means[0])/batch_vars[0].sqrt()).asnumpy()) +print("manually calculated:", ((data[0][0][0][0] - batch_means[0])/mx.np.sqrt(batch_vars[0])).asnumpy()) print("automatically calculated:", output[0][0][0][0].asnumpy()) ``` @@ -153,7 +158,7 @@ You should notice though that these running statistics do not match the batch st for i in range(100): with mx.autograd.record(): output = net(data) - loss = output.abs() + loss = mx.np.abs(output) loss.backward() print('running_means:', net.running_mean.data().asnumpy()) print('running_vars:', net.running_var.data().asnumpy()) @@ -212,7 +217,7 @@ As an example, we'll apply `LayerNorm` to a batch of 2 samples, each with 4 time ```{.python .input} -data = mx.nd.arange(start=0, stop=2*4*2).reshape(2, 4, 2) +data = mx.np.arange(start=0, stop=2*4*2).reshape(2, 4, 2) print(data) ``` @@ -251,7 +256,7 @@ As an example, we'll apply `InstanceNorm` to a batch of 2 samples, each with 2 c ```{.python .input} -data = mx.nd.arange(start=0, stop=2*2*2*2).reshape(2, 2, 2, 2) +data = mx.np.arange(start=0, stop=2*2*2*2).reshape(2, 2, 2, 2) print(data) ``` diff --git a/docs/python_docs/python/tutorials/packages/gluon/training/trainer.md b/docs/python_docs/python/tutorials/packages/gluon/training/trainer.md index 05be542c127d..11db13a0b409 100644 --- a/docs/python_docs/python/tutorials/packages/gluon/training/trainer.md +++ b/docs/python_docs/python/tutorials/packages/gluon/training/trainer.md @@ -32,7 +32,7 @@ The final step is to update the neural network model parameters using an optimiz To illustrate how to use the Gluon `Trainer` we will create a simple perceptron model and create a `Trainer ` instance using the perceptron model parameters and a simple optimizer - `sgd` with learning rate as 1. 
```{.python .input} -from mxnet import nd, autograd, optimizer, gluon +from mxnet import np, autograd, optimizer, gluon net = gluon.nn.Dense(1) net.initialize() @@ -48,8 +48,8 @@ Before we can use the `trainer` to update model parameters, we must first run th ```{.python .input} batch_size = 8 -X = nd.random.uniform(shape=(batch_size, 4)) -y = nd.random.uniform(shape=(batch_size,)) +X = np.random.uniform(size=(batch_size, 4)) +y = np.random.uniform(size=(batch_size,)) loss = gluon.loss.L2Loss() diff --git a/docs/python_docs/python/tutorials/packages/kvstore/kvstore.md b/docs/python_docs/python/tutorials/packages/kvstore/kvstore.md index e7b97c08d1b1..bc1b0b1211d4 100644 --- a/docs/python_docs/python/tutorials/packages/kvstore/kvstore.md +++ b/docs/python_docs/python/tutorials/packages/kvstore/kvstore.md @@ -31,8 +31,8 @@ import mxnet as mx kv = mx.kv.create('local') # create a local kv store. shape = (2,3) -kv.init(3, mx.nd.ones(shape)*2) -a = mx.nd.zeros(shape) +kv.init(3, mx.np.ones(shape)*2) +a = mx.np.zeros(shape) kv.pull(3, out = a) print(a.asnumpy()) ``` @@ -44,7 +44,7 @@ print(a.asnumpy()) For any key that has been initialized, you can push a new value with the same shape to the key: ```{.python .input} -kv.push(3, mx.nd.ones(shape)*8) +kv.push(3, mx.np.ones(shape)*8) kv.pull(3, out = a) # pull out the value print(a.asnumpy()) ``` @@ -58,7 +58,7 @@ Please note summation only happens if the value list is longer than one ```{.python .input} contexts = [mx.cpu(i) for i in range(4)] -b = [mx.nd.ones(shape, ctx) for ctx in contexts] +b = [mx.np.ones(shape=shape, ctx=ctx) for ctx in contexts] kv.push(3, b) kv.pull(3, out = a) print(a.asnumpy()) @@ -82,7 +82,7 @@ print(a.asnumpy()) `[[ 4. 4. 4.],[ 4. 4. 4.]]` ```{.python .input} -kv.push(3, mx.nd.ones(shape)) +kv.push(3, mx.np.ones(shape)) kv.pull(3, out=a) print(a.asnumpy()) ``` @@ -98,7 +98,7 @@ You've already seen how to pull a single key-value pair. 
Similarly, to push, you pull the value onto several devices with a single call: ```{.python .input} -b = [mx.nd.ones(shape, ctx) for ctx in contexts] +b = [mx.np.ones(shape=shape, ctx=ctx) for ctx in contexts] kv.pull(3, out = b) print(b[1].asnumpy()) ``` @@ -114,9 +114,9 @@ For a single device: ```{.python .input} keys = [5, 7, 9] -kv.init(keys, [mx.nd.ones(shape)]*len(keys)) -kv.push(keys, [mx.nd.ones(shape)]*len(keys)) -b = [mx.nd.zeros(shape)]*len(keys) +kv.init(keys, [mx.np.ones(shape)]*len(keys)) +kv.push(keys, [mx.np.ones(shape)]*len(keys)) +b = [mx.np.zeros(shape)]*len(keys) kv.pull(keys, out = b) print(b[1].asnumpy()) ``` @@ -132,7 +132,7 @@ print(b[1].asnumpy()) For multiple devices: ```{.python .input} -b = [[mx.nd.ones(shape, ctx) for ctx in contexts]] * len(keys) +b = [[mx.np.ones(shape=shape, ctx=ctx) for ctx in contexts]] * len(keys) kv.push(keys, b) kv.pull(keys, out = b) print(b[1][1].asnumpy()) diff --git a/docs/python_docs/python/tutorials/packages/np/cheat-sheet.md b/docs/python_docs/python/tutorials/packages/np/cheat-sheet.md index cbcbc9378528..6536bfff2ec7 100644 --- a/docs/python_docs/python/tutorials/packages/np/cheat-sheet.md +++ b/docs/python_docs/python/tutorials/packages/np/cheat-sheet.md @@ -90,7 +90,7 @@ npx.load('my_array') ```{.python .input} # Save a list of arrays b = np.array([4, 6, 8]) -npx.save('my_arrays', [a, b]) # FIXME, cannot be a tuple +npx.savez('my_arrays', *[a, b]) npx.load('my_arrays') ``` diff --git a/docs/python_docs/python/tutorials/packages/onnx/fine_tuning_gluon.md b/docs/python_docs/python/tutorials/packages/onnx/fine_tuning_gluon.md index 3080ad16be97..2c6315b6fc76 100644 --- a/docs/python_docs/python/tutorials/packages/onnx/fine_tuning_gluon.md +++ b/docs/python_docs/python/tutorials/packages/onnx/fine_tuning_gluon.md @@ -50,11 +50,11 @@ logging.basicConfig(level=logging.INFO) import matplotlib.pyplot as plt import mxnet as mx -from mxnet import gluon, nd, autograd +from mxnet import gluon, np, npx, autograd from mxnet.gluon.data.vision.datasets import ImageFolderDataset from mxnet.gluon.data import DataLoader import mxnet.contrib.onnx as onnx_mxnet -import numpy as np +import numpy as onp %matplotlib inline ``` @@ -156,7 +156,7 @@ We transform the dataset images using the following operations: def transform(image, label): resized = mx.image.resize_short(image, EDGE) cropped, crop_info = mx.image.center_crop(resized, SIZE) - transposed = nd.transpose(cropped, (2,0,1)) + transposed = np.transpose(cropped, (2,0,1)) return transposed, label ``` @@ -340,22 +340,22 @@ trainer = gluon.Trainer(net.collect_params(), 'sgd', ### Evaluation loop -We measure the accuracy in a non-blocking way, using `nd.array` to take care of the parallelisation that MXNet and Gluon offers. +We measure the accuracy in a non-blocking way, using `np.array` to take care of the parallelisation that MXNet and Gluon offers. 
```{.python .input} def evaluate_accuracy_gluon(data_iterator, net): num_instance = 0 - sum_metric = nd.zeros(1,ctx=ctx, dtype=np.int32) + sum_metric = np.zeros(1,ctx=ctx, dtype=np.int32) for i, (data, label) in enumerate(data_iterator): data = data.astype(np.float32).as_in_context(ctx) label = label.astype(np.int32).as_in_context(ctx) output = net(data) - prediction = nd.argmax(output, axis=1).astype(np.int32) + prediction = np.argmax(output, axis=1).astype(np.int32) num_instance += len(prediction) sum_metric += (prediction==label).sum() accuracy = (sum_metric.astype(np.float32)/num_instance) - return accuracy.asscalar() + return accuracy.item() ``` @@ -379,7 +379,7 @@ for epoch in range(5): label = label.as_in_context(ctx) if i%20==0 and i >0: - print('Batch [{0}] loss: {1:.4f}'.format(i, loss.mean().asscalar())) + print('Batch [{0}] loss: {1:.4f}'.format(i, loss.mean().item())) with autograd.record(): output = net(data) @@ -387,7 +387,7 @@ for epoch in range(5): loss.backward() trainer.step(data.shape[0]) - nd.waitall() # wait at the end of the epoch + npx.waitall() # wait at the end of the epoch new_val_accuracy = evaluate_accuracy_gluon(dataloader_test, net) print("Epoch [{0}] Test Accuracy {1:.4f} ".format(epoch, new_val_accuracy)) @@ -416,7 +416,7 @@ TOP_P = 3 ```{.python .input} # Convert img to format expected by the network def transform(img): - return nd.array(np.expand_dims(np.transpose(img, (2,0,1)),axis=0).astype(np.float32), ctx=ctx) + return np.array(np.expand_dims(np.transpose(img, (2,0,1)),axis=0).astype(np.float32), ctx=ctx) ``` diff --git a/docs/python_docs/python/tutorials/packages/optimizer/index.md b/docs/python_docs/python/tutorials/packages/optimizer/index.md index bd194733a629..6aa0206dc1e2 100644 --- a/docs/python_docs/python/tutorials/packages/optimizer/index.md +++ b/docs/python_docs/python/tutorials/packages/optimizer/index.md @@ -149,7 +149,7 @@ To instantiate the Adagrad optimizer in MXNet you can use the following line of ```{.python .input} -adagrad_optimizer = optimizer.AdaGrad(learning_rate=0.1, eps=1e-07) +adagrad_optimizer = optimizer.AdaGrad(learning_rate=0.1, epsilon=1e-07) ``` ### [RMSProp](../../../api/optimizer/index.rst#mxnet.optimizer.RMSProp) diff --git a/docs/python_docs/python/tutorials/performance/backend/amp.md b/docs/python_docs/python/tutorials/performance/backend/amp.md index 2d572a8c3cad..9656441ef4d0 100644 --- a/docs/python_docs/python/tutorials/performance/backend/amp.md +++ b/docs/python_docs/python/tutorials/performance/backend/amp.md @@ -70,9 +70,9 @@ class SyntheticDataLoader(object): shape = (batch_size, 3, data_shape, data_shape) cls_targets_shape = (batch_size, 6132) box_targets_shape = (batch_size, 6132, 4) - self.data = mx.nd.random.uniform(-1, 1, shape=shape, ctx=mx.cpu_pinned()) - self.cls_targets = mx.nd.random.uniform(0, 1, shape=cls_targets_shape, ctx=mx.cpu_pinned()) - self.box_targets = mx.nd.random.uniform(0, 1, shape=box_targets_shape, ctx=mx.cpu_pinned()) + self.data = mx.np.random.uniform(-1, 1, size=shape, ctx=mx.cpu_pinned()) + self.cls_targets = mx.np.random.uniform(0, 1, size=cls_targets_shape, ctx=mx.cpu_pinned()) + self.box_targets = mx.np.random.uniform(0, 1, size=box_targets_shape, ctx=mx.cpu_pinned()) def next(self): if self.counter >= self.epoch_size: @@ -274,11 +274,11 @@ with mx.Context(mx.gpu(0)): model = get_model("resnet50_v1") model.initialize(ctx=mx.current_context()) model.hybridize() - model(mx.nd.zeros((1, 3, 224, 224))) + model(mx.np.zeros((1, 3, 224, 224))) converted_model = 
amp.convert_hybrid_block(model) # Run dummy inference with the converted gluon model - result = converted_model.forward(mx.nd.random.uniform(shape=(1, 3, 224, 224), + result = converted_model.forward(mx.np.random.uniform(size=(1, 3, 224, 224), dtype=np.float32)) print("Conversion and Inference completed successfully") diff --git a/docs/python_docs/python/tutorials/performance/backend/profiler.md b/docs/python_docs/python/tutorials/performance/backend/profiler.md index 354dc48e2f70..f935e46f2258 100644 --- a/docs/python_docs/python/tutorials/performance/backend/profiler.md +++ b/docs/python_docs/python/tutorials/performance/backend/profiler.md @@ -25,12 +25,12 @@ If you have just started to use MXNet, you might be tempted to measure the execu ```{.python .input} from time import time -from mxnet import autograd, nd +from mxnet import autograd, np import mxnet as mx start = time() -x = nd.random_uniform(shape=(2000,2000)) -y = nd.dot(x, x) +x = np.random.uniform(size=(2000,2000)) +y = np.dot(x, x) print('Time for matrix multiplication: %f sec\n' % (time() - start)) start = time() @@ -151,7 +151,7 @@ profiler.set_state('run') run_training_iteration(*next(itr)) # Make sure all operations have completed -mx.nd.waitall() +mx.npx.waitall() # Ask the profiler to stop recording profiler.set_state('stop') # Dump all results to log file before download @@ -265,7 +265,7 @@ class CustomAddOneProp(mx.operator.CustomOpProp): return MyAddOne() -inp = mx.nd.zeros(shape=(500, 500)) +inp = mx.np.zeros(shape=(500, 500)) profiler.set_config(profile_all=True, continuous_dump=True, \ aggregate_stats=True) @@ -273,7 +273,7 @@ profiler.set_state('run') w = nd.Custom(inp, op_type="MyAddOne") -mx.nd.waitall() +mx.npx.waitall() profiler.set_state('stop') print(profiler.dumps()) @@ -301,7 +301,7 @@ a = mx.symbol.Variable('a') b = mx.symbol.Custom(data=a, op_type='MyAddOne') c = b.bind(mx.cpu(), {'a': inp}) y = c.forward() -mx.nd.waitall() +mx.npx.waitall() profiler.set_state('stop') print(profiler.dumps()) profiler.dump()
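Taken together, the documentation edits in this PR apply a small set of recurring legacy-NDArray to NumPy-style substitutions. The sketch below is not part of the diff; it simply collects, in one runnable place, the mappings that appear throughout the hunks above:

```python
import mxnet as mx
from mxnet import np, npx            # replaces: from mxnet import nd

ctx = mx.cpu()
x = np.random.uniform(size=(2, 20))  # was: nd.random.uniform(shape=(2, 20))
x = x.as_in_ctx(ctx)                 # was: x.as_in_context(ctx)
scalar = np.mean(x).item()           # was: nd.mean(x).asscalar()
npx.waitall()                        # was: mx.nd.waitall()
metric = mx.gluon.metric.Accuracy()  # was: mx.metric.Accuracy()
```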