From a5ec9b9738ebbc30ebe38374a37bae6369946e07 Mon Sep 17 00:00:00 2001 From: Sheng Zha Date: Fri, 14 Aug 2020 12:52:22 -0700 Subject: [PATCH] update notebooks --- .../python/tutorials/deploy/export/onnx.md | 12 +- .../inference/image_classification_jetson.md | 2 +- .../python/tutorials/extend/customop.md | 24 +-- .../gluon_from_experiment_to_deployment.md | 14 +- .../logistic_regression_explained.md | 18 +- .../gluon/blocks/activations/activations.md | 22 +-- .../packages/gluon/blocks/custom-layer.md | 14 +- .../gluon/blocks/custom_layer_beginners.md | 20 +-- .../packages/gluon/blocks/hybridize.md | 12 +- .../tutorials/packages/gluon/blocks/naming.md | 22 +-- .../packages/gluon/blocks/save_load_params.md | 20 +-- .../packages/gluon/data/data_augmentation.md | 28 ++-- .../tutorials/packages/gluon/data/datasets.md | 28 ++-- .../gluon/image/image-augmentation.md | 148 ----------------- .../packages/gluon/image/info_gan.md | 40 ++--- .../tutorials/packages/gluon/image/mnist.md | 24 +-- .../python/tutorials/packages/gluon/index.rst | 12 -- .../packages/gluon/loss/custom-loss.md | 18 +- .../tutorials/packages/gluon/loss/loss.md | 2 +- .../gluon/training/fit_api_tutorial.md | 26 +-- .../learning_rates/learning_rate_finder.md | 16 +- .../learning_rates/learning_rate_schedules.md | 30 ++-- .../learning_rate_schedules_advanced.md | 30 ++-- .../gluon/training/normalization/index.md | 36 ++-- .../tutorials/packages/kvstore/kvstore.md | 16 +- .../legacy/ndarray/01-ndarray-intro.md | 26 +-- .../legacy/ndarray/02-ndarray-operations.md | 32 ++-- .../legacy/ndarray/gotchas_numpy_in_mxnet.md | 6 +- .../packages/legacy/ndarray/sparse/csr.md | 38 ++--- .../legacy/ndarray/sparse/row_sparse.md | 42 ++--- .../legacy/ndarray/sparse/train_gluon.md | 50 +++--- .../tutorials/packages/np/cheat-sheet.md | 154 +++++++++--------- .../packages/onnx/fine_tuning_gluon.md | 62 +++---- .../packages/onnx/inference_on_onnx_model.md | 40 ++--- .../tutorials/packages/optimizer/index.md | 30 ++-- .../tutorials/performance/backend/amp.md | 22 +-- .../tutorials/performance/backend/profiler.md | 22 +-- 37 files changed, 499 insertions(+), 659 deletions(-) delete mode 100644 docs/python_docs/python/tutorials/packages/gluon/image/image-augmentation.md diff --git a/docs/python_docs/python/tutorials/deploy/export/onnx.md b/docs/python_docs/python/tutorials/deploy/export/onnx.md index f44d476312aa..4867bc86a603 100644 --- a/docs/python_docs/python/tutorials/deploy/export/onnx.md +++ b/docs/python_docs/python/tutorials/deploy/export/onnx.md @@ -34,7 +34,7 @@ To run the tutorial you will need to have installed the following python modules *Note:* MXNet-ONNX importer and exporter follows version 7 of ONNX operator set which comes with ONNX v1.2.1. -```python +```{.python .input} import mxnet as mx import numpy as np from mxnet.contrib import onnx as onnx_mxnet @@ -47,7 +47,7 @@ logging.basicConfig(level=logging.INFO) We download the pre-trained ResNet-18 [ImageNet](http://www.image-net.org/) model from the [MXNet Model Zoo](/api/python/docs/api/gluon/model_zoo/index.html). We will also download synset file to match labels. -```python +```{.python .input} # Download pre-trained resnet model - json and params by running following code. path='http://data.mxnet.io/models/imagenet/' [mx.test_utils.download(path+'resnet/18-layers/resnet-18-0000.params'), @@ -61,7 +61,7 @@ Now, we have downloaded ResNet-18 symbol, params and synset file on the disk. Let us describe the MXNet's `export_model` API. 
-```python +```{.python .input} help(onnx_mxnet.export_model) ``` @@ -109,7 +109,7 @@ Since we have downloaded pre-trained model files, we will use the `export_model` We will use the downloaded pre-trained model files (sym, params) and define input variables. -```python +```{.python .input} # Downloaded input symbol and params files sym = './resnet-18-symbol.json' params = './resnet-18-0000.params' @@ -123,7 +123,7 @@ onnx_file = './mxnet_exported_resnet50.onnx' We have defined the input parameters required for the `export_model` API. Now, we are ready to covert the MXNet model into ONNX format. -```python +```{.python .input} # Invoke export model API. It returns path of the converted onnx model converted_model_path = onnx_mxnet.export_model(sym, params, [input_shape], np.float32, onnx_file) ``` @@ -134,7 +134,7 @@ This API returns path of the converted model which you can later use to import t Now we can check validity of the converted ONNX model by using ONNX checker tool. The tool will validate the model by checking if the content contains valid protobuf: -```python +```{.python .input} from onnx import checker import onnx diff --git a/docs/python_docs/python/tutorials/deploy/inference/image_classification_jetson.md b/docs/python_docs/python/tutorials/deploy/inference/image_classification_jetson.md index 5a697ca7960e..0a7a8d5d5bd2 100644 --- a/docs/python_docs/python/tutorials/deploy/inference/image_classification_jetson.md +++ b/docs/python_docs/python/tutorials/deploy/inference/image_classification_jetson.md @@ -73,7 +73,7 @@ And we are done. You can test the installation now by importing mxnet from pytho We are now ready to run a pre-trained model and run inference on a Jetson module. In this tutorial we are using ResNet-50 model trained on Imagenet dataset. We run the following classification script with either cpu/gpu context using python3. -```python +```{.python .input} from mxnet import gluon import mxnet as mx diff --git a/docs/python_docs/python/tutorials/extend/customop.md b/docs/python_docs/python/tutorials/extend/customop.md index f1ee1d2ae601..d7c08f4751eb 100644 --- a/docs/python_docs/python/tutorials/extend/customop.md +++ b/docs/python_docs/python/tutorials/extend/customop.md @@ -26,7 +26,7 @@ Custom operator in python is easy to develop and good for prototyping, but may h -```python +```{.python .input} import numpy as np import mxnet as mx from mxnet import gluon, autograd @@ -42,7 +42,7 @@ This operator implements the standard sigmoid activation function. This is only First we implement the forward and backward computation by sub-classing `mx.operator.CustomOp`: -```python +```{.python .input} class Sigmoid(mx.operator.CustomOp): def forward(self, is_train, req, in_data, out_data, aux): """Implements forward computation. @@ -75,7 +75,7 @@ class Sigmoid(mx.operator.CustomOp): Then we need to register the custom op and describe it's properties like input and output shapes so that mxnet can recognize it. 
This is done by sub-classing `mx.operator.CustomOpProp`: -```python +```{.python .input} @mx.operator.register("sigmoid") # register with name "sigmoid" class SigmoidProp(mx.operator.CustomOpProp): def __init__(self): @@ -110,7 +110,7 @@ class SigmoidProp(mx.operator.CustomOpProp): We can now use this operator by calling `mx.nd.Custom`: -```python +```{.python .input} x = mx.nd.array([0, 1, 2, 3]) # attach gradient buffer to x for autograd x.attach_grad() @@ -121,7 +121,7 @@ with autograd.record(): print(y) ``` -```python +```{.python .input} # call backward computation y.backward() # gradient is now saved to the grad buffer we attached previously @@ -137,7 +137,7 @@ The dense operator performs a dot product between data and weight, then add bias ### Forward & backward implementation -```python +```{.python .input} class Dense(mx.operator.CustomOp): def __init__(self, bias): self._bias = bias @@ -158,7 +158,7 @@ class Dense(mx.operator.CustomOp): ### Registration -```python +```{.python .input} @mx.operator.register("dense") # register with name "sigmoid" class DenseProp(mx.operator.CustomOpProp): def __init__(self, bias): @@ -192,7 +192,7 @@ class DenseProp(mx.operator.CustomOpProp): Parameterized CustomOp are usually used together with Blocks, which holds the parameter. -```python +```{.python .input} class DenseBlock(mx.gluon.Block): def __init__(self, in_channels, channels, bias, **kwargs): super(DenseBlock, self).__init__(**kwargs) @@ -207,7 +207,7 @@ class DenseBlock(mx.gluon.Block): ### Example usage -```python +```{.python .input} dense = DenseBlock(3, 5, 0.1) dense.initialize() x = mx.nd.uniform(shape=(4, 3)) @@ -218,7 +218,7 @@ print(y) ## Using custom operators with fork In Linux systems, the default method in multiprocessing to create process is by using fork. If there are unfinished async custom operations when forking, the program will be blocked because of python GIL. Always use sync calls like `wait_to_read` or `waitall` before calling fork. -```python +```{.python .input} x = mx.nd.array([0, 1, 2, 3]) y = mx.nd.Custom(x, op_type='sigmoid') # unfinished async sigmoid operation will cause blocking @@ -227,10 +227,10 @@ os.fork() Correctly handling this will make mxnet depend upon libpython, so the workaround now is to ensure that all custom operations are executed before forking process. -```python +```{.python .input} x = mx.nd.array([0, 1, 2, 3]) y = mx.nd.Custom(x, op_type='sigmoid') # force execution by reading y print(y.asnumpy()) os.fork() -``` \ No newline at end of file +``` diff --git a/docs/python_docs/python/tutorials/getting-started/gluon_from_experiment_to_deployment.md b/docs/python_docs/python/tutorials/getting-started/gluon_from_experiment_to_deployment.md index bd9dbacf3e97..7f34708c0f4c 100644 --- a/docs/python_docs/python/tutorials/getting-started/gluon_from_experiment_to_deployment.md +++ b/docs/python_docs/python/tutorials/getting-started/gluon_from_experiment_to_deployment.md @@ -44,7 +44,7 @@ We will use the [Oxford 102 Category Flower Dataset](http://www.robots.ox.ac.uk/ We have prepared a utility file to help you download and organize your data into train, test, and validation sets. 
Run the following Python code to download and prepare the data: -```python +```{.python .input} import mxnet as mx data_util_file = "oxford_102_flower_dataset.py" base_url = "https://raw.githubusercontent.com/apache/incubator-mxnet/master/docs/tutorial_utils/data/{}?raw=true" @@ -65,7 +65,7 @@ Now your data will be organized into train, test, and validation sets, images be Now let's first import necessary packages: -```python +```{.python .input} import math import os import time @@ -80,7 +80,7 @@ from mxnet.gluon.model_zoo.vision import resnet50_v2 Next, we define the hyper-parameters that we will use for fine-tuning. We will use the [MXNet learning rate scheduler](/api/python/docs/tutorials/packages/gluon/training/learning_rates/learning_rate_schedules.html) to adjust learning rates during training. Here we set the `epochs` to 1 for quick demonstration, please change to 40 for actual training. -```python +```{.python .input} classes = 102 epochs = 1 lr = 0.001 @@ -108,7 +108,7 @@ Now we will apply data augmentations on training images. This makes minor altera For validation and inference, we only need to apply step 1, 4, and 5. We also need to save the mean and standard deviation values for [inference using C++](/api/cpp/docs/tutorials/cpp_inference). -```python +```{.python .input} jitter_param = 0.4 lighting_param = 0.1 @@ -165,7 +165,7 @@ Before we go to training, one unique Gluon feature you should be aware of is hyb -```python +```{.python .input} # load pre-trained resnet50_v2 from model zoo finetune_net = resnet50_v2(pretrained=True, ctx=ctx) @@ -195,7 +195,7 @@ Now let's define the test metrics and start fine-tuning. -```python +```{.python .input} def test(net, val_data, ctx): metric = mx.metric.Accuracy() for i, (data, label) in enumerate(val_data): @@ -254,7 +254,7 @@ We now have a trained our custom model. This can be serialized into model files -```python +```{.python .input} finetune_net.export("flower-recognition", epoch=epochs) ``` diff --git a/docs/python_docs/python/tutorials/getting-started/logistic_regression_explained.md b/docs/python_docs/python/tutorials/getting-started/logistic_regression_explained.md index 277aa5d2d82c..e36e048f371a 100644 --- a/docs/python_docs/python/tutorials/getting-started/logistic_regression_explained.md +++ b/docs/python_docs/python/tutorials/getting-started/logistic_regression_explained.md @@ -23,7 +23,7 @@ Logistic Regression is one of the first models newcomers to Deep Learning are im Before anything else, let's import required packages for this tutorial. -```python +```{.python .input} import numpy as np import mxnet as mx from mxnet import nd, autograd, gluon @@ -36,7 +36,7 @@ mx.random.seed(12345) # Added for reproducibility In this tutorial we will use fake dataset, which contains 10 features drawn from a normal distribution with mean equals to 0 and standard deviation equals to 1, and a class label, which can be either 0 or 1. The size of the dataset is an arbitrary value. The function below helps us to generate a dataset. Class label `y` is generated via a non-random logic, so the network would have a pattern to look for. Boundary of 3 is selected to make sure that number of positive examples smaller than negative, but not too small -```python +```{.python .input} def get_random_data(size, ctx): x = nd.normal(0, 1, shape=(size, 10), ctx=ctx) y = x.sum(axis=1) > 3 @@ -46,7 +46,7 @@ def get_random_data(size, ctx): Also, let's define a set of hyperparameters, that we are going to use later. 
Since our model is simple and dataset is small, we are going to use CPU for calculations. Feel free to change it to GPU for a more advanced scenario. -```python +```{.python .input} ctx = mx.cpu() train_data_size = 1000 val_data_size = 100 @@ -60,7 +60,7 @@ To work with data, Apache MXNet provides [Dataset](https://mxnet.apache.org/api/ Below we define training and validation datasets, which we are going to use in the tutorial. -```python +```{.python .input} train_x, train_ground_truth_class = get_random_data(train_data_size, ctx) train_dataset = ArrayDataset(train_x, train_ground_truth_class) train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True) @@ -77,7 +77,7 @@ The only requirement for the logistic regression is that the last layer of the n Below, we define a model which has an input layer of 10 neurons, a couple of inner layers of 10 neurons each, and output layer of 1 neuron. We stack the layers using [HybridSequential](https://mxnet.apache.org/api/python/gluon/gluon.html#mxnet.gluon.nn.HybridSequential) block and initialize parameters of the network using [Xavier](https://mxnet.apache.org/api/python/optimization/optimization.html#mxnet.initializer.Xavier) initialization. -```python +```{.python .input} net = nn.HybridSequential() net.add(nn.Dense(units=10, activation='relu')) # input layer @@ -99,7 +99,7 @@ Metric helps us to estimate how good our model is in terms of a problem we are t Below we define these objects. -```python +```{.python .input} loss = gluon.loss.SigmoidBinaryCrossEntropyLoss() trainer = Trainer(params=net.collect_params(), optimizer='sgd', optimizer_params={'learning_rate': 0.1}) @@ -110,7 +110,7 @@ f1 = mx.metric.F1() The next step is to define the training function in which we iterate over all batches of training data, execute the forward pass on each batch and calculate training loss. On line 19, we sum losses of every batch per epoch into a single variable, because we calculate loss per single batch, but want to display it per epoch. -```python +```{.python .input} def train_model(): cumulative_train_loss = 0 @@ -159,7 +159,7 @@ For `F1` metric to work, instead of one number per class, we must pass probabili Then we pass this stacked matrix to `F1` score. -```python +```{.python .input} def validate_model(threshold): cumulative_val_loss = 0 @@ -193,7 +193,7 @@ def validate_model(threshold): By using the defined above functions, we can finally write our main training loop. -```python +```{.python .input} epochs = 10 threshold = 0.5 diff --git a/docs/python_docs/python/tutorials/packages/gluon/blocks/activations/activations.md b/docs/python_docs/python/tutorials/packages/gluon/blocks/activations/activations.md index 755253708b43..e5ba40353a9f 100644 --- a/docs/python_docs/python/tutorials/packages/gluon/blocks/activations/activations.md +++ b/docs/python_docs/python/tutorials/packages/gluon/blocks/activations/activations.md @@ -25,7 +25,7 @@ If you are looking to answer the question, 'which activation function should I u In order to compare the various activation functions and to understand the nuances of their differences we have a snippet of code to plot the activation functions (used in the forward pass) and their gradients (used in the backward pass). 
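
A gradient curve like the ones plotted below can be obtained with `autograd`: record the forward pass, call `backward()`, and read the gradient off the input. Here is a minimal sketch of that core idea (the plotting helper used below presumably builds on the same mechanism, adding the matplotlib code around it):

```{.python .input}
# Minimal sketch: forward values and gradient of an activation via autograd
import mxnet as mx
from mxnet import autograd

x = mx.nd.arange(-5, 5, 0.1)
x.attach_grad()                       # allocate a gradient buffer for x
act = mx.gluon.nn.Activation('relu')  # parameter-free block, no initialize() needed
with autograd.record():
    y = act(x)
y.backward()                          # x.grad now holds d(act)/dx at each point
print(y[48:52], x.grad[48:52])        # values around x = 0
```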
-```python +```{.python .input} import numpy as np import mxnet as mx from matplotlib import pyplot as plt @@ -62,7 +62,7 @@ $$ \sigma(x) = \dfrac{e^x}{e^x + 1} $$ Warning: the term sigmoid is overloaded and can be used to refer to the class of 's' shaped functions or particularly to the logistic function that we've just described. In MxNet the sigmoid activation specifically refers to logistic function sigmoid. -```python +```{.python .input} visualize_activation(mx.gluon.nn.Activation('sigmoid')) ``` @@ -90,7 +90,7 @@ which shows its direct relation to sigmoid by the following equation: $$ tanh(x) = 2\sigma(2x) - 1$$ -```python +```{.python .input} visualize_activation(mx.gluon.nn.Activation('tanh')) ``` @@ -107,7 +107,7 @@ The SoftSign activation is an alternative to tanh that is also centered at zero $$ softsign(x) = \dfrac{x}{abs(x) + 1} $$ -```python +```{.python .input} visualize_activation(mx.gluon.nn.Activation('softsign')) ``` @@ -129,7 +129,7 @@ ReLU was introduced to neural networks in the [paper by Hahnloser et al](https:/ ReLU is the most widely used activation due to its simplicity and performance across multiple datasets and although there have been efforts to introduce activation functions, many of them described in this tutorial, that improve on ReLU, they have not gained as much widespread adoption. -```python +```{.python .input} visualize_activation(mx.gluon.nn.Activation('relu')) ``` @@ -148,7 +148,7 @@ $$ SoftReLU(x) = log(1 + e^x)$$ The SoftReLU can be seen as a smooth version of the ReLU by observing that its derivative is the sigmoid, seen below, which is a smooth version of the gradient of the ReLU shown above. -```python +```{.python .input} visualize_activation(mx.gluon.nn.Activation('softrelu')) ``` @@ -170,7 +170,7 @@ where $\alpha > 0$ is small positive number. In MXNet, by default the $\alpha$ p Here is a visualization for the LeakyReLU with $\alpha = 0.05$ -```python +```{.python .input} visualize_activation(mx.gluon.nn.LeakyReLU(0.05)) ``` @@ -184,7 +184,7 @@ As shown in the graph, the LeakyReLU's gradient is non-zero everywhere, in an at The PReLU activation function, or Parametric Leaky ReLU introduced by [He et al](https://arxiv.org/pdf/1502.01852.pdf), is a version of LeakyReLU that learns the parameter $\alpha$ during training. An initialization parameter is passed into the PreLU activation layer and this is treated as a learnable parameter that is updated via gradient descent during training. This is in contrast to LeakyReLU where $\alpha$ is a hyperparameter. -```python +```{.python .input} prelu = mx.gluon.nn.PReLU(mx.init.Normal(0.05)) prelu.initialize() visualize_activation(prelu) @@ -208,7 +208,7 @@ $$ ELU(\alpha, x) = \begin{cases} \end{cases}$$ -```python +```{.python .input} visualize_activation(mx.gluon.nn.ELU()) ``` @@ -229,7 +229,7 @@ $$ SELU(\alpha, x) = \lambda \cdot\begin{cases} In SELU, unlike ELU, the parameters $\alpha$ and $\lambda$ are fixed parameters calculated from the data. For standard scaled inputs, these values are $$\alpha=1.6732, \lambda=1.0507$$ as calculated in the paper. -```python +```{.python .input} visualize_activation(mx.gluon.nn.SELU()) ``` @@ -247,7 +247,7 @@ $$ swish(x) = x\cdot\sigma(\beta x)$$ where $\sigma$ is the sigmoid activation function $\sigma(x) = \frac{1}{1 + e^{-x}}$ described above and $\beta$ is a hyperparameter set to 1 by default in MXNet. 
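
Before visualizing it below, here is a quick numerical check of this definition (a minimal sketch, assuming the default $\beta = 1$):

```{.python .input}
# Compare the built-in Swish block against the formula x * sigma(beta * x), with beta = 1
x = mx.nd.array([-2.0, 0.0, 2.0])
swish = mx.gluon.nn.Swish()     # parameter-free, so it can be called without initialize()
manual = x * mx.nd.sigmoid(x)
print(swish(x))
print(manual)
```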
-```python +```{.python .input} visualize_activation(mx.gluon.nn.Swish()) ``` diff --git a/docs/python_docs/python/tutorials/packages/gluon/blocks/custom-layer.md b/docs/python_docs/python/tutorials/packages/gluon/blocks/custom-layer.md index 8a6a2cb6c21a..ff62a55d2617 100644 --- a/docs/python_docs/python/tutorials/packages/gluon/blocks/custom-layer.md +++ b/docs/python_docs/python/tutorials/packages/gluon/blocks/custom-layer.md @@ -31,7 +31,7 @@ The only instance method needed to be implemented is [forward(self, x)](https:// In the example below, we define a new layer and implement `forward()` method to normalize input data by fitting it into a range of [0, 1]. -```python +```{.python .input} # Do some initial imports used throughout this tutorial from __future__ import print_function import mxnet as mx @@ -41,7 +41,7 @@ mx.random.seed(1) # Set seed for reproducable results ``` -```python +```{.python .input} class NormalizationLayer(gluon.Block): def __init__(self): super(NormalizationLayer, self).__init__() @@ -69,7 +69,7 @@ To support hybridization, it is important to use only methods avaible directly f Knowing this, we can can rewrite our example layer, using HybridBlock: -```python +```{.python .input} class NormalizationHybridLayer(gluon.HybridBlock): def __init__(self): super(NormalizationHybridLayer, self).__init__() @@ -81,7 +81,7 @@ class NormalizationHybridLayer(gluon.HybridBlock): Thanks to inheriting from HybridBlock, one can easily do forward pass on a given ndarray, either on CPU or GPU: -```python +```{.python .input} layer = NormalizationHybridLayer() layer(nd.array([1, 2, 3], ctx=mx.cpu())) ``` @@ -109,7 +109,7 @@ Depending on which class you used as a base one, you can use either [Sequential] Below is an example of how to create a simple neural network with a custom layer. In this example, `NormalizationHybridLayer` gets as an input the output from `Dense(5)` layer and pass its output as an input to `Dense(1)` layer. -```python +```{.python .input} net = gluon.nn.HybridSequential() # Define a Neural Network as a sequence of hybrid blocks net.add(Dense(5)) # Add Dense layer with 5 neurons net.add(NormalizationHybridLayer()) # Add our custom layer @@ -142,7 +142,7 @@ Usually, a layer has a set of associated parameters, sometimes also referred as All parameters of a block are stored and accessed via [ParameterDict](https://github.com/apache/incubator-mxnet/blob/master/python/mxnet/gluon/parameter.py#L508) class. This class helps with initialization, updating, saving and loading of the parameters. Each layer can have multiple set of parameters, and all of them can be stored in a single instance of the `ParameterDict` class. On a block level, the instance of the `ParameterDict` class is accessible via `self.params` field, and outside of a block one can access all parameters of the network via [collect_params()](https://mxnet.apache.org/api/python/gluon/gluon.html#mxnet.gluon.Block.collect_params) method called on a `container`. `ParameterDict` uses [Parameter](https://mxnet.apache.org/api/python/gluon/gluon.html#mxnet.gluon.Parameter) class to represent parameters inside of Apache MxNet neural network. If parameter doesn't exist, trying to get a parameter via `self.params` will create it automatically. 
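
As a point of reference before defining a custom parameterized layer below, here is a minimal sketch of inspecting the parameters of a built-in layer (the layer and its shapes are arbitrary):

```{.python .input}
# Sketch: inspecting the ParameterDict of a built-in layer
dense = mx.gluon.nn.Dense(3, in_units=2)
dense.initialize()
print(dense.collect_params())   # ParameterDict holding the weight and bias Parameters
print(dense.weight.data())      # a Parameter exposes its values via .data()
```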
-```python +```{.python .input} class NormalizationHybridLayer(gluon.HybridBlock): def __init__(self, hidden_units, scales): super(NormalizationHybridLayer, self).__init__() @@ -179,7 +179,7 @@ The last peculiarity is due to support of imperative and symbolic programming by Running forward pass on this network is very similar to the previous example, so instead of just doing one forward pass, let's run whole training for a few epochs to show that `scales` parameter doesn't change during the training while `weights` parameter is changing. -```python +```{.python .input} def print_params(title, net): """ Helper function to print out the state of parameters of NormalizationHybridLayer diff --git a/docs/python_docs/python/tutorials/packages/gluon/blocks/custom_layer_beginners.md b/docs/python_docs/python/tutorials/packages/gluon/blocks/custom_layer_beginners.md index 99fed59678ca..005ecd510a56 100644 --- a/docs/python_docs/python/tutorials/packages/gluon/blocks/custom_layer_beginners.md +++ b/docs/python_docs/python/tutorials/packages/gluon/blocks/custom_layer_beginners.md @@ -29,7 +29,7 @@ The only instance method needed to be implemented is [forward(self, x)](https:// In the example below, we define a new layer and implement `forward()` method to normalize input data by fitting it into a range of [0, 1]. -```python +```{.python .input} # Do some initial imports used throughout this tutorial from __future__ import print_function import mxnet as mx @@ -38,7 +38,7 @@ from mxnet.gluon.nn import Dense mx.random.seed(1) # Set seed for reproducable results ``` -```python +```{.python .input} class NormalizationLayer(gluon.Block): def __init__(self): super(NormalizationLayer, self).__init__() @@ -65,7 +65,7 @@ To support hybridization, it is important to use only methods available directly Knowing this, we can can rewrite our example layer, using HybridBlock: -```python +```{.python .input} class NormalizationHybridLayer(gluon.HybridBlock): def __init__(self): super(NormalizationHybridLayer, self).__init__() @@ -76,12 +76,12 @@ class NormalizationHybridLayer(gluon.HybridBlock): Thanks to inheriting from HybridBlock, one can easily do forward pass on a given ndarray, either on CPU or GPU: -```python +```{.python .input} layer = NormalizationHybridLayer() layer(nd.array([1, 2, 3], ctx=mx.cpu())) ``` -```python +```{.python .input} [0. 0.5 1. ] ``` @@ -100,7 +100,7 @@ Depending on which class you used as a base one, you can use either [Sequential] Below is an example of how to create a simple neural network with a custom layer. In this example, `NormalizationHybridLayer` gets as an input the output from `Dense(5)` layer and pass its output as an input to `Dense(1)` layer. -```python +```{.python .input} net = gluon.nn.HybridSequential() # Define a Neural Network as a sequence of hybrid blocks net.add(Dense(5)) # Add Dense layer with 5 neurons net.add(NormalizationHybridLayer()) # Add our custom layer @@ -113,7 +113,7 @@ input = nd.random_uniform(low=-10, high=10, shape=(5, 2)) # Create 5 random exam net(input) ``` -```python +```{.python .input} [[-0.13601446] [ 0.26103732] [-0.05046433] @@ -128,7 +128,7 @@ Usually, a layer has a set of associated parameters, sometimes also referred as All parameters of a block are stored and accessed via [ParameterDict](https://github.com/apache/incubator-mxnet/blob/master/python/mxnet/gluon/parameter.py#L508) class. This class helps with initialization, updating, saving and loading of the parameters. 
Each layer can have multiple set of parameters, and all of them can be stored in a single instance of the `ParameterDict` class. On a block level, the instance of the `ParameterDict` class is accessible via `self.params` field, and outside of a block one can access all parameters of the network via [collect_params()](https://mxnet.apache.org/api/python/gluon/gluon.html#mxnet.gluon.Block.collect_params) method called on a `container`. `ParamterDict` uses [Parameter](https://mxnet.apache.org/api/python/gluon/gluon.html#mxnet.gluon.Parameter) class to represent parameters inside of Apache MxNet neural network. If parameter doesn’t exist, trying to get a parameter via `self.params` will create it automatically. -```python +```{.python .input} class NormalizationHybridLayer(gluon.HybridBlock): def __init__(self, hidden_units, scales): super(NormalizationHybridLayer, self).__init__() @@ -165,7 +165,7 @@ The last peculiarity is due to support of imperative and symbolic programming by Running forward pass on this network is very similar to the previous example, so instead of just doing one forward pass, let’s run whole training for a few epochs to show that `scales` parameter doesn’t change during the training while `weights` parameter is changing. -```python +```{.python .input} def print_params(title, net): """ Helper function to print out the state of parameters of NormalizationHybridLayer @@ -206,7 +206,7 @@ trainer.step(input.shape[0]) # Trainer updates print_params("=========== Parameters after backward pass ===========\n", net) ``` -```python +```{.python .input} =========== Parameters after forward pass =========== hybridsequential94_normalizationhybridlayer0_weights = diff --git a/docs/python_docs/python/tutorials/packages/gluon/blocks/hybridize.md b/docs/python_docs/python/tutorials/packages/gluon/blocks/hybridize.md index 5f28699dd860..a0d18e3ae7aa 100644 --- a/docs/python_docs/python/tutorials/packages/gluon/blocks/hybridize.md +++ b/docs/python_docs/python/tutorials/packages/gluon/blocks/hybridize.md @@ -225,7 +225,7 @@ The difference between a purely imperative `Block` and hybridizable `HybridBlock When trying to access specific elements in a tensor like this: -```python +```{.python .input} def hybrid_forward(self, F, x): return x[0,0] ``` @@ -240,7 +240,7 @@ There are however several operators that can help you with array manipulations l Sometimes one can be tempted to use conditional logic on the type of the input tensors however the following block: -```python +```{.python .input} def hybrid_forward(self, F, x): if x.dtype =='float16': return x @@ -255,7 +255,7 @@ You cannot use the `dtype` of the symbol at runtime. Symbols only describe opera Similarly you cannot use the compute context of symbol for the same reason that symbols only describe the operations on the data and not the data (or context). You cannot do this: -```python +```{.python .input} def hybrid_forward(self, F, x): if x.context == mx.cpu(): return x @@ -270,7 +270,7 @@ Accessing the current compute context is not possible with symbols. Consider pas Accessing shape information of tensors is very often used for example when trying to flatten a tensor and then reshape it back to its original shape. 
-```python +```{.python .input} def hybrid_forward(self, F, x): return x*x.shape[0] ``` @@ -286,7 +286,7 @@ There are also a lot of operators that support special indices to help with most Last but not least, you cannot directly assign values in tensor in a symbolic graph, the resulting tensors always needs to be the results of operations performed on the inputs of the computational graph. The following code: -```python +```{.python .input} def hybrid_forward(self, F, x): x[0] = 2 return x @@ -298,7 +298,7 @@ Direct item assignment is not possible in symbolic graph since it needs to be pa e.g to set the first element to 2 you can do: -```python +```{.python .input} x = mx.nd.array([1,2,3]) value = mx.nd.ones_like(x)*2 condition = mx.nd.array([0,1,1]) diff --git a/docs/python_docs/python/tutorials/packages/gluon/blocks/naming.md b/docs/python_docs/python/tutorials/packages/gluon/blocks/naming.md index 6f98a2f6b2ce..511bd9b2a4b5 100644 --- a/docs/python_docs/python/tutorials/packages/gluon/blocks/naming.md +++ b/docs/python_docs/python/tutorials/packages/gluon/blocks/naming.md @@ -22,7 +22,7 @@ In gluon, each Parameter or Block has a name. Parameter names and Block names ca In this tutorial we talk about the best practices on naming. First, let's import MXNet and Gluon: -```python +```{.python .input} from __future__ import print_function import mxnet as mx from mxnet import gluon @@ -33,7 +33,7 @@ from mxnet import gluon When creating a block, you can simply do as follows: -```python +```{.python .input} mydense = gluon.nn.Dense(100) print(mydense.name) ``` @@ -41,7 +41,7 @@ print(mydense.name) When you create more Blocks of the same kind, they will be named with incrementing suffixes to avoid collision: -```python +```{.python .input} dense1 = gluon.nn.Dense(100) print(dense1.name) ``` @@ -51,7 +51,7 @@ print(dense1.name) Parameters will be named automatically by a unique name in the format of `param_{uuid4}_{name}`: -```python +```{.python .input} param = gluon.Parameter(name = 'bias') print(param.name) ``` @@ -61,7 +61,7 @@ print(param.name) When getting parameters within a Block, you should use the structure based name as the key: -```python +```{.python .input} print(dense0.collect_params()) ``` @@ -70,7 +70,7 @@ print(dense0.collect_params()) In MXNet 2, we don't have to define children blocks within a `name_scope` any more. Let's demonstrate this by defining and initiating a simple neural net: -```python +```{.python .input} class Model(gluon.HybridBlock): def __init__(self): super(Model, self).__init__() @@ -92,7 +92,7 @@ model0(mx.nd.zeros((1, 20))) The same principle also applies to container blocks like Sequential. We can simply do as follows: -```python +```{.python .input} net = gluon.nn.Sequential() net.add(gluon.nn.Dense(20)) net.add(gluon.nn.Dense(20)) @@ -105,7 +105,7 @@ net.add(gluon.nn.Dense(20)) For `HybridBlock`, we use `save_parameters`/`load_parameters`, which uses model structure, instead of parameter name, to match parameters. -```python +```{.python .input} model0.save_parameters('model.params') model1.load_parameters('model.params') print(mx.nd.load('model.params').keys()) @@ -113,7 +113,7 @@ print(mx.nd.load('model.params').keys()) For `SymbolBlock.imports`, we use `export`, which uses parameter name `param.name`, to save parameters. -```python +```{.python .input} model0.export('model0') model2 = gluon.SymbolBlock.imports('model0-symbol.json', ['data'], 'model0-0000.params') ``` @@ -130,7 +130,7 @@ To see how to do this, we first load a pretrained AlexNet. 
- Note that the output layer is a dense block with 1000 dimension outputs. -```python +```{.python .input} alexnet = gluon.model_zoo.vision.alexnet(pretrained=True) print(alexnet.output) ``` @@ -139,7 +139,7 @@ print(alexnet.output) To change the output to 100 dimension, we replace it with a new block. -```python +```{.python .input} alexnet.output = gluon.nn.Dense(100) alexnet.output.initialize() ``` diff --git a/docs/python_docs/python/tutorials/packages/gluon/blocks/save_load_params.md b/docs/python_docs/python/tutorials/packages/gluon/blocks/save_load_params.md index 38f3b5dae159..631a3151be2d 100644 --- a/docs/python_docs/python/tutorials/packages/gluon/blocks/save_load_params.md +++ b/docs/python_docs/python/tutorials/packages/gluon/blocks/save_load_params.md @@ -31,7 +31,7 @@ The Model architecture of `Hybrid` models stays static and don't change during e Let's look at the above methods in more detail. Let's start by importing the modules we'll need. -```python +```{.python .input} from __future__ import print_function import mxnet as mx @@ -48,7 +48,7 @@ We need a trained model before we can save it to a file. So let's go ahead and b Let's define a helper function to build a LeNet model and another helper to train LeNet with MNIST. -```python +```{.python .input} # Use GPU if one exists, else use CPU ctx = mx.gpu() if mx.context.num_gpus() else mx.cpu() @@ -115,7 +115,7 @@ def train_model(model): Let's build a model and train it. After training, we will save and restore this model from a file. -```python +```{.python .input} net = build_lenet(gluon.nn.Sequential()) train_model(net) ``` @@ -144,7 +144,7 @@ Epoch: 0; Batch 900; Loss 0.008402 Okay, we now have a model (`net`) that we can save to a file. Let's save the parameters of this model to a file using the `save_parameters` function. -```python +```{.python .input} file_name = "net.params" net.save_parameters(file_name) ``` @@ -155,7 +155,7 @@ We have successfully saved the parameters of the model into a file. Let's now create a network with the parameters we saved into the file. We build the network again using the helper first and then load the weights from the file we saved using the `load_parameters` function. -```python +```{.python .input} new_net = build_lenet(gluon.nn.Sequential()) new_net.load_parameters(file_name, ctx=ctx) ``` @@ -166,7 +166,7 @@ If our network is [Hybrid](https://mxnet.apache.org/tutorials/gluon/hybrid.html) Let's test the model we just loaded from file. -```python +```{.python .input} import matplotlib.pyplot as plt def verify_loaded_model(net): @@ -209,7 +209,7 @@ Model predictions: [1. 1. 4. 5. 0. 5. 7. 0. 3. 6.] Note that the network we created above is not a Hybrid network and therefore cannot be serialized into a JSON file. So, let's create a Hybrid version of the same network and train it. -```python +```{.python .input} net = build_lenet(gluon.nn.HybridSequential()) net.hybridize() train_model(net) @@ -238,7 +238,7 @@ Epoch: 0; Batch 900; Loss 0.037809 We now have a trained hybrid network. This can be exported into files using the `export` function. The `export` function will export the model architecture into a `.json` file and model parameters into a `.params` file. -```python +```{.python .input} net.export("lenet", epoch=1) ``` @@ -256,7 +256,7 @@ One of the main reasons to serialize model architecture into a JSON file is to l Serialized Hybrid networks (saved as .JSON and .params file) can be loaded and used inside Python frontend using `gluon.nn.SymbolBlock`. 
To demonstrate that, let's load the network we serialized above. -```python +```{.python .input} import warnings with warnings.catch_warnings(): warnings.simplefilter("ignore") @@ -265,7 +265,7 @@ with warnings.catch_warnings(): `deserialized_net` now contains the network we deserialized from files. Let's test the deserialized network to make sure it works. -```python +```{.python .input} verify_loaded_model(deserialized_net) ``` diff --git a/docs/python_docs/python/tutorials/packages/gluon/data/data_augmentation.md b/docs/python_docs/python/tutorials/packages/gluon/data/data_augmentation.md index 0e320fc2890e..3b4c26a637ef 100644 --- a/docs/python_docs/python/tutorials/packages/gluon/data/data_augmentation.md +++ b/docs/python_docs/python/tutorials/packages/gluon/data/data_augmentation.md @@ -30,7 +30,7 @@ You should be familiar with the concept of a transform and how to apply it to a You can find them in the `mxnet.gluon.data.vision.transforms` module, alongside the deterministic transforms we've seen previously, such as `ToTensor`, `Normalize`, `CenterCrop` and `Resize`. Augmentations involve an element of randomness and all the augmentation transforms are prefixed with `Random`, such as `RandomResizedCrop` and `RandomBrightness`. We'll start by importing MXNet and the `transforms`. -```python +```{.python .input} import matplotlib.pyplot as plt import mxnet as mx from mxnet.gluon.data.vision import transforms @@ -41,7 +41,7 @@ from mxnet.gluon.data.vision import transforms So that we can see the effects of all the vision augmentations, we'll take a sample image of a giraffe and apply various augmentations to it. We can see what it looks like to begin with. -```python +```{.python .input} image_url = 'https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/doc/tutorials/data_aug/inputs/0.jpg' mx.test_utils.download(image_url, "giraffe.jpg") example_image = mx.image.imread("giraffe.jpg") @@ -54,7 +54,7 @@ plt.imshow(example_image.asnumpy()) Since these augmentations are random, we'll apply the same augmentation a few times and plot all of the outputs. We define a few utility functions to help with this. -```python +```{.python .input} def show_images(imgs, num_rows, num_cols, scale=2): # show augmented images in a grid layout aspect_ratio = imgs[0].shape[0]/imgs[0].shape[1] @@ -90,7 +90,7 @@ As an example, we randomly (using a uniform distribution) crop a region of the i And then we resize this cropped region to 200 by 200 pixels. -```python +```{.python .input} shape_aug = transforms.RandomResizedCrop(size=(200, 200), scale=(0.1, 1), ratio=(0.5, 2)) @@ -105,7 +105,7 @@ apply(example_image, shape_aug) A simple augmentation technique is flipping. Usually flipping horizontally doesn't change the category of object and results in an image that's still plausible in the real world. Using `RandomFlipLeftRight`, we randomly flip the image horizontally 50% of the time. -```python +```{.python .input} apply(example_image, transforms.RandomFlipLeftRight()) ``` @@ -117,7 +117,7 @@ apply(example_image, transforms.RandomFlipLeftRight()) Although it's not as common as flipping left and right, you can flip the image vertically 50% of the time with `RandomFlipTopBottom`. With our giraffe example, we end up with less plausible samples that horizontal flipping, with the ground above the sky in some cases. 
-```python +```{.python .input} apply(example_image, transforms.RandomFlipTopBottom()) ``` @@ -140,7 +140,7 @@ image *= alpha So by setting this to 0.5 we randomly change the brightness of the image to a value between 50% ($1-0.5$) and 150% ($1+0.5$) of the original image. -```python +```{.python .input} apply(example_image, transforms.RandomBrightness(0.5)) ``` @@ -161,7 +161,7 @@ image += gray ``` -```python +```{.python .input} apply(example_image, transforms.RandomContrast(0.5)) ``` @@ -173,7 +173,7 @@ apply(example_image, transforms.RandomContrast(0.5)) Use `RandomSaturation` to add a random saturation jitter to an image. Saturation can be thought of as the 'amount' of color in an image. Use the `saturation` parameter to control the amount of jitter in saturation, with value from 0 (no change) to 1 (potentially large change). `saturation` doesn't specify whether the saturation of the augmented image will be higher or lower, just the potential strength of the effect. Specifically the augmentation is using the method detailed [here](https://beesbuzz.biz/code/16-hsv-color-transforms). -```python +```{.python .input} apply(example_image, transforms.RandomSaturation(0.5)) ``` @@ -185,7 +185,7 @@ apply(example_image, transforms.RandomSaturation(0.5)) Use `RandomHue` to add a random hue jitter to images. Hue can be thought of as the 'shade' of the colors in an image. Use the `hue` parameter to control the amount of jitter in hue, with value from 0 (no change) to 1 (potentially large change). `hue` doesn't specify whether the hue of the augmented image will be shifted one way or the other, just the potential strength of the effect. Specifically the augmentation is using the method detailed [here](https://beesbuzz.biz/code/16-hsv-color-transforms). -```python +```{.python .input} apply(example_image, transforms.RandomHue(0.5)) ``` @@ -197,7 +197,7 @@ apply(example_image, transforms.RandomHue(0.5)) `RandomColorJitter` is a convenience transform that can be used to perform multiple color augmentations at once. You can set the `brightness`, `contrast`, `saturation` and `hue` jitters, that function the same as above for their individual transforms. -```python +```{.python .input} color_aug = transforms.RandomColorJitter(brightness=0.5, contrast=0.5, saturation=0.5, @@ -213,7 +213,7 @@ apply(example_image, color_aug) Use `RandomLighting` for an AlexNet-style PCA-based noise augmentation. -```python +```{.python .input} apply(example_image, transforms.RandomLighting(alpha=1)) ``` @@ -224,7 +224,7 @@ apply(example_image, transforms.RandomLighting(alpha=1)) In practice, we apply multiple augmentation techniques to an image to increase the variety of images in the dataset. Using the `Compose` transform that was introduced in the [Data Transforms tutorial](), we can apply 3 of the transforms we previously used above. 
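
In a full training pipeline, a composed set of augmentations like the one built below is typically attached to a `Dataset` with `transform_first`, so it runs on every image as it is loaded. A minimal sketch (the dataset and parameter values here are illustrative, not part of this tutorial):

```{.python .input}
# Sketch: wiring composed augmentations into a training DataLoader via transform_first
from mxnet.gluon.data import DataLoader
from mxnet.gluon.data.vision import CIFAR10

train_augs = transforms.Compose([transforms.RandomFlipLeftRight(),
                                 transforms.RandomColorJitter(brightness=0.3),
                                 transforms.ToTensor()])
train_set = CIFAR10(train=True).transform_first(train_augs)
train_loader = DataLoader(train_set, batch_size=32, shuffle=True)
```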
-```python +```{.python .input} augs = transforms.Compose([ transforms.RandomFlipLeftRight(), color_aug, shape_aug]) apply(example_image, augs) @@ -232,4 +232,4 @@ apply(example_image, augs) ![png](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/doc/tutorials/gluon/transforms/output_41_0.png) - \ No newline at end of file + diff --git a/docs/python_docs/python/tutorials/packages/gluon/data/datasets.md b/docs/python_docs/python/tutorials/packages/gluon/data/datasets.md index bd291dd92253..c09e62a8cd2c 100644 --- a/docs/python_docs/python/tutorials/packages/gluon/data/datasets.md +++ b/docs/python_docs/python/tutorials/packages/gluon/data/datasets.md @@ -27,7 +27,7 @@ One of the most critical steps for model training and inference is loading the d We first start by generating random data `X` (with 3 variables) and corresponding random labels `y` to simulate a typical supervised learning task. We generate 10 samples and we pass them all to the `ArrayDataset`. -```python +```{.python .input} import mxnet as mx import os import tarfile @@ -41,7 +41,7 @@ dataset = mx.gluon.data.dataset.ArrayDataset(X, y) A key feature of a `Dataset` is the __*ability to retrieve a single sample given an index*__. Our random data and labels were generated in memory, so this `ArrayDataset` doesn't have to load anything from disk, but the interface is the same for all `Dataset`'s. -```python +```{.python .input} sample_idx = 4 sample = dataset[sample_idx] @@ -67,7 +67,7 @@ A [DataLoader](/api/python/docs/api/gluon/data/index.html#dataloader) is used to Another benefit of using `DataLoader` is the ability to easily load data in parallel using [multiprocessing](https://docs.python.org/3.6/library/multiprocessing.html). You can set the `num_workers` parameter to the number of CPUs available on your machine for maximum performance, or limit it to a lower number to spare resources. -```python +```{.python .input} from multiprocessing import cpu_count CPU_COUNT = cpu_count() @@ -95,7 +95,7 @@ Using Gluon `Dataset` objects, we define the data to be included in each of thes Many of the image `Dataset`'s accept a function (via the optional `transform` parameter) which is applied to each sample returned by the `Dataset`. It's useful for performing data augmentation, but can also be used for more simple data type conversion and pixel value scaling as seen below. -```python +```{.python .input} def transform(data, label): data = data.astype('float32')/255 return data, label @@ -105,7 +105,7 @@ valid_dataset = mx.gluon.data.vision.datasets.FashionMNIST(train=False, transfor ``` -```python +```{.python .input} %matplotlib inline from matplotlib.pylab import imshow @@ -136,7 +136,7 @@ When training machine learning models it is important to shuffle the training sa If you have more complex shuffling requirements (e.g. when handling sequential data), take a look at [mxnet.gluon.data.BatchSampler](/api/python/docs/api/gluon/data/index.html#mxnet.gluon.data.BatchSampler) and pass this to your `DataLoader` instead. -```python +```{.python .input} batch_size = 32 train_data_loader = mx.gluon.data.DataLoader(train_dataset, batch_size, shuffle=True, num_workers=CPU_COUNT) valid_data_loader = mx.gluon.data.DataLoader(valid_dataset, batch_size, num_workers=CPU_COUNT) @@ -145,7 +145,7 @@ valid_data_loader = mx.gluon.data.DataLoader(valid_dataset, batch_size, num_work With both `DataLoader`s defined, we can now train a model to classify each image and evaluate the validation loss at each epoch. 
Our Fashion MNIST dataset has 10 classes including shirt, dress, sneakers, etc. We define a simple fully connected network with a softmax output and use cross entropy as our loss. -```python +```{.python .input} from mxnet import gluon, autograd, ndarray def construct_net(): @@ -166,7 +166,7 @@ criterion = gluon.loss.SoftmaxCrossEntropyLoss() trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.1}) ``` -```python +```{.python .input} epochs = 5 @@ -224,7 +224,7 @@ We will run through an example for image classification, but a similar process a You can download the Caltech 101 dataset if you don't already have images to work with for this example, but please note the download is 126MB. -```python +```{.python .input} data_folder = "data" dataset_name = "101_ObjectCategories" @@ -243,7 +243,7 @@ if not os.path.isfile(archive_path): After downloading and extracting the data archive, we have two folders: `data/101_ObjectCategories` and `data/101_ObjectCategories_test`. We load the data into separate training and testing [ImageFolderDataset](/api/python/docs/api/gluon/data/vision/datasets/index.html#mxnet.gluon.data.vision.datasets.ImageFolderDataset)s. -```python +```{.python .input} training_path = os.path.join(data_folder, dataset_name) testing_path = os.path.join(data_folder, "{}_test".format(dataset_name)) ``` @@ -253,7 +253,7 @@ We instantiate the [ImageFolderDataset](https://mxnet.incubator.apache.org/api/p Optionally, you can pass a `transform` parameter to these `Dataset`'s as we've seen before. -```python +```{.python .input} train_dataset = mx.gluon.data.vision.datasets.ImageFolderDataset(training_path) test_dataset = mx.gluon.data.vision.datasets.ImageFolderDataset(testing_path) ``` @@ -263,7 +263,7 @@ Samples from these datasets are tuples of data and label. Images are loaded from As with the Fashion MNIST dataset the labels will be integer encoded. You can use the `synsets` property of the [ImageFolderDataset](https://mxnet.incubator.apache.org/api/python/gluon/data.html?highlight=imagefolderdataset#mxnet.gluon.data.vision.datasets.ImageFolderDataset)s to retrieve the original descriptions (e.g. `train_dataset.synsets[i]`). -```python +```{.python .input} sample_idx = 539 sample = train_dataset[sample_idx] data = sample[0] @@ -299,7 +299,7 @@ Before Gluon's [DataLoader](/api/python/docs/api/gluon/data/index.html#dataloade So you can get up and running with Gluon quicker if you have already implemented complex pre-processing steps using `DataIter`, we have provided a simple class to wrap existing `DataIter` objects so they can be used in a typical Gluon training loop. You can use this class for `DataIter`s such as [mxnet.image.ImageIter](/api/python/docs/api/mxnet/image/index.html#mxnet.image.ImageIter) and [mxnet.io.ImageRecordIter](/api/python/docs/api/mxnet/io/index.html#mxnet.io.ImageDetRecordIter) that have single data and label arrays. 
-```python +```{.python .input} class DataIterLoader(): def __init__(self, data_iter): self.data_iter = data_iter @@ -320,7 +320,7 @@ class DataIterLoader(): ``` -```python +```{.python .input} data_iter = mx.io.NDArrayIter(data=X, label=y, batch_size=5) data_iter_loader = DataIterLoader(data_iter) for X_batch, y_batch in data_iter_loader: diff --git a/docs/python_docs/python/tutorials/packages/gluon/image/image-augmentation.md b/docs/python_docs/python/tutorials/packages/gluon/image/image-augmentation.md deleted file mode 100644 index 70be781be3d6..000000000000 --- a/docs/python_docs/python/tutorials/packages/gluon/image/image-augmentation.md +++ /dev/null @@ -1,148 +0,0 @@ - - - - - - - - - - - - - - - - - -# Image Augmentation - -Image augmentation technology expands the scale of -training data sets by making a series of random changes to the training images -to produce similar, but different, training examples. Given its popularity in -computer vision, the `mxnet.gluon.data.vision.transforms` model provides -multiple pre-defined image augmentation methods. In this section we will briefly -go through this module. - -First, import the module required for this section. - -```python -from matplotlib import pyplot as plt -from mxnet import image -from mxnet.gluon import data as gdata, utils -``` - -Then read the sample $400\times 500$ image. - -```python -utils.download('https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/doc/cat.jpg') -img = image.imread('cat.jpg') -plt.imshow(img.asnumpy()) -plt.show() -``` - -In addition, we define a function to draw a list of images. - -```python -def show_images(imgs, num_rows, num_cols, scale=2): - figsize = (num_cols * scale, num_rows * scale) - _, axes = plt.subplots(num_rows, num_cols, figsize=figsize) - for i in range(num_rows): - for j in range(num_cols): - axes[i][j].imshow(imgs[i * num_cols + j].asnumpy()) - axes[i][j].axes.get_xaxis().set_visible(False) - axes[i][j].axes.get_yaxis().set_visible(False) - return axes -``` - -Most image augmentation methods have a certain degree of randomness. To make it -easier for us to observe the effect of image augmentation, we next define the -auxiliary function `apply`. This function runs the image augmentation method -`aug` multiple times on the input image `img` and shows all results. - -```python -def apply(img, aug, num_rows=2, num_cols=4, scale=3): - Y = [aug(img) for _ in range(num_rows * num_cols)] - show_images(Y, num_rows, num_cols, scale) -``` - -## Flip and Crop - -Flipping the image left and right usually does not change the -category of the object. This is one of the earliest and most widely used methods -of image augmentation. Next, we use the `transforms` module to create the -`RandomFlipLeftRight` instance, which introduces a 50% chance that the image is -flipped left and right. - -```python -apply(img, gdata.vision.transforms.RandomFlipLeftRight()) -``` - -Flipping up and down is not as commonly used as flipping left and right. -However, at least for this example image, flipping up and down does not hinder -recognition. Next, we create a `RandomFlipTopBottom` instance for a 50% chance -of flipping the image up and down. - -```python -apply(img, gdata.vision.transforms.RandomFlipTopBottom()) -``` - -In the example image we used, the cat is in the middle of the image, but this -may not be the case for all images. 
In the [Pooling Layer](https://d2l.ai/chapter_convolutional-neural-networks/pooling.html) section of the d2l.ai book, we explain that the pooling layer can reduce the sensitivity of the convolutional -layer to the target location. In addition, we can make objects appear at -different positions in the image in different proportions by randomly cropping -the image. This can also reduce the sensitivity of the model to the target -position. - -In the following code, we randomly crop a region with an area of 10% -to 100% of the original area, and the ratio of width to height of the region is -randomly selected from between 0.5 and 2. Then, the width and height of the -region are both scaled to 200 pixels. Unless otherwise stated, the random number -between $a$ and $b$ in this section refers to a continuous value obtained by -uniform sampling in the interval $[a,b]$. - -```{.python .input n=7} -shape_aug = gdata.vision.transforms.RandomResizedCrop( - (200, 200), scale=(0.1, 1), ratio=(0.5, 2)) -apply(img, shape_aug) -``` - -## Change Color - -Another augmentation method is changing colors. We can change -four aspects of the image color: brightness, contrast, saturation, and hue. In -the example below, we randomly change the brightness of the image to a value -between 50% ($1-0.5$) and 150% ($1+0.5$) of the original image. - -```{.python .input n=8} -apply(img, gdata.vision.transforms.RandomBrightness(0.5)) -``` - -Similarly, we can randomly change the hue of the image. - -```{.python .input n=9} -apply(img, gdata.vision.transforms.RandomHue(0.5)) -``` - -We can also create a `RandomColorJitter` instance and set how to randomly change -the `brightness`, `contrast`, `saturation`, and `hue` of the image at the same -time. - -```{.python .input n=10} -color_aug = gdata.vision.transforms.RandomColorJitter( - brightness=0.5, contrast=0.5, saturation=0.5, hue=0.5) -apply(img, color_aug) -``` - -## Overlying Multiple Image Augmentation Methods - -In practice, we will overlay -multiple image augmentation methods. We can overlay the different image -augmentation methods defined above and apply them to each image by using a -`Compose` instance. - -```{.python .input n=11} -augs = gdata.vision.transforms.Compose([ - gdata.vision.transforms.RandomFlipLeftRight(), color_aug, shape_aug]) -apply(img, augs) -``` diff --git a/docs/python_docs/python/tutorials/packages/gluon/image/info_gan.md b/docs/python_docs/python/tutorials/packages/gluon/image/info_gan.md index 463ee341e7c4..d7cd28aef5fe 100644 --- a/docs/python_docs/python/tutorials/packages/gluon/image/info_gan.md +++ b/docs/python_docs/python/tutorials/packages/gluon/image/info_gan.md @@ -22,7 +22,7 @@ This notebook shows how to implement an InfoGAN based on Gluon. InfoGAN is an ex The codes are made meaningful by maximizing the mutual information between code and generator output. InfoGAN learns a disentangled representation in a completely unsupervised manner. It can be used for many applications such as image similarity search. This notebook uses the DCGAN example from the [Straight Dope Book](https://gluon.mxnet.io/chapter14_generative-adversarial-networks/dcgan.html) and extends it to create an InfoGAN. -```python +```{.python .input} from __future__ import print_function from datetime import datetime import logging @@ -46,7 +46,7 @@ from mxnet import autograd The latent code vector can contain several variables, which can be categorical and/or continuous. We set `n_continuous` to 2 and `n_categories` to 10. 
-```python +```{.python .input} batch_size = 64 z_dim = 100 n_continuous = 2 @@ -57,7 +57,7 @@ ctx = mx.gpu() if mx.context.num_gpus() else mx.cpu() Some functions to load and normalize images. -```python +```{.python .input} lfw_url = 'http://vis-www.cs.umass.edu/lfw/lfw-deepfunneled.tgz' data_path = 'lfw_dataset' if not os.path.exists(data_path): @@ -69,7 +69,7 @@ if not os.path.exists(data_path): ``` -```python +```{.python .input} def transform(data, width=64, height=64): data = mx.image.imresize(data, width, height) data = nd.transpose(data, (2,0,1)) @@ -80,7 +80,7 @@ def transform(data, width=64, height=64): ``` -```python +```{.python .input} def get_files(data_dir): images = [] filenames = [] @@ -99,7 +99,7 @@ def get_files(data_dir): Load the dataset `lfw_dataset` which contains images of celebrities. -```python +```{.python .input} data_dir = 'lfw_dataset' images, filenames = get_files(data_dir) split = int(len(images)*0.8) @@ -116,7 +116,7 @@ train_dataloader = gluon.data.DataLoader(train_data, batch_size=batch_size, shuf Define the Generator model. Architecture is taken from the DCGAN implementation in [Straight Dope Book](https://gluon.mxnet.io/chapter14_generative-adversarial-networks/dcgan.html). The Generator consist of 4 layers where each layer involves a strided convolution, batch normalization, and rectified nonlinearity. It takes as input random noise and the latent code and produces an `(64,64,3)` output image. -```python +```{.python .input} class Generator(gluon.HybridBlock): def __init__(self, **kwargs): super(Generator, self).__init__(**kwargs) @@ -149,7 +149,7 @@ class Generator(gluon.HybridBlock): Define the Discriminator and Q model. The Q model shares many layers with the Discriminator. Its task is to estimate the code `c` for a given fake image. It is used to maximize the lower bound to the mutual information. -```python +```{.python .input} class Discriminator(gluon.HybridBlock): def __init__(self, **kwargs): super(Discriminator, self).__init__(**kwargs) @@ -195,7 +195,7 @@ Discriminator and Generator are the same as in the DCGAN example. On top of the Initialize Generator and Discriminator and define correspoing trainer function. -```python +```{.python .input} generator = Generator() generator.hybridize() generator.initialize(mx.init.Normal(0.002), ctx=ctx) @@ -215,7 +215,7 @@ q_trainer = gluon.Trainer(discriminator.Q.collect_params(), 'adam', {'learning_r Create vectors with real (=1) and fake labels (=0). -```python +```{.python .input} real_label = nd.ones((batch_size,), ctx=ctx) fake_label = nd.zeros((batch_size,),ctx=ctx) ``` @@ -223,7 +223,7 @@ fake_label = nd.zeros((batch_size,),ctx=ctx) Load a pretrained model. -```python +```{.python .input} if os.path.isfile('infogan_d_latest.params') and os.path.isfile('infogan_g_latest.params'): discriminator.load_parameters('infogan_d_latest.params', ctx=ctx, allow_missing=True, ignore_extra=True) generator.load_parameters('infogan_g_latest.params', ctx=ctx, allow_missing=True, ignore_extra=True) @@ -243,7 +243,7 @@ where `V(D,G)` is the GAN loss and the mutual information `I(c, G(z, c))` goes i Define the loss functions. `SoftmaxCrossEntropyLoss` for the categorical code, `L2Loss` for the continious code and `SigmoidBinaryCrossEntropyLoss` for the normal GAN loss. 
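
Before instantiating them below, here is a quick sketch of what each of these losses expects, run on dummy inputs (all shapes and values are purely illustrative and not part of the training loop):

```{.python .input}
# Illustrative dummy-shape check of the three loss terms
bce = gluon.loss.SigmoidBinaryCrossEntropyLoss()
l2 = gluon.loss.L2Loss()
ce = gluon.loss.SoftmaxCrossEntropyLoss()

d_logits = nd.random.uniform(shape=(4, 1))               # discriminator output (logits)
print(bce(d_logits, nd.ones((4, 1))))                    # real/fake term

cont_pred = nd.random.normal(shape=(4, 2))               # predicted continuous code
cont_code = nd.random.normal(shape=(4, 2))
print(l2(cont_pred, cont_code))                          # continuous-code term

cat_logits = nd.random.normal(shape=(4, 10))             # predicted category logits
print(ce(cat_logits, nd.array([0, 3, 5, 9])))            # categorical-code term
```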
-```python +```{.python .input} loss1 = gluon.loss.SigmoidBinaryCrossEntropyLoss() loss2 = gluon.loss.L2Loss() loss3 = gluon.loss.SoftmaxCrossEntropyLoss() @@ -252,7 +252,7 @@ loss3 = gluon.loss.SoftmaxCrossEntropyLoss() This function samples `c`, `z`, and concatenates them to create the generator input. -```python +```{.python .input} def create_generator_input(): #create random noise @@ -273,7 +273,7 @@ Define the training loop. 4. Update Generator and Q -```python +```{.python .input} with SummaryWriter(logdir='./logs/') as sw: epochs = 1 @@ -351,7 +351,7 @@ Once the InfoGAN is trained, we can use the Discriminator to do an image similar Load the trained discriminator and retrieve one of its last layers. -```python +```{.python .input} discriminator = Discriminator() discriminator.load_parameters("infogan_d_latest.params", ctx=ctx, ignore_extra=True) @@ -364,7 +364,7 @@ discriminator.hybridize() Nearest neighbor function, which takes a matrix of features and an input feature vector. It returns the 3 closest features. -```python +```{.python .input} def get_knn(features, input_vector, k=3): dist = (nd.square(features - input_vector).sum(axis=1))/features.shape[0] indices = dist.asnumpy().argsort()[:k] @@ -374,7 +374,7 @@ def get_knn(features, input_vector, k=3): A helper function to visualize image data. -```python +```{.python .input} def visualize(img_array): plt.imshow(((img_array.asnumpy().transpose(1, 2, 0) + 1.0) * 127.5).astype(np.uint8)) plt.axis('off') @@ -383,7 +383,7 @@ def visualize(img_array): Take some images from the test data, obtain its feature vector from `discriminator.D[:11]` and plot images of the corresponding closest vectors in the feature space. -```python +```{.python .input} feature_size = 8192 features = nd.zeros((len(test_images), feature_size), ctx=ctx) @@ -425,7 +425,7 @@ We trained the Generator for a couple of epochs and stored a couple of fake imag The following function computes the TSNE on the feature matrix and stores the result in a json-file. This file can be loaded with [TSNEViewer](https://ml4a.github.io/guides/ImageTSNEViewer/) -```python +```{.python .input} import json from sklearn.manifold import TSNE @@ -449,4 +449,4 @@ Load the file with TSNEViewer. You can now inspect whether similiar looking imag - \ No newline at end of file + diff --git a/docs/python_docs/python/tutorials/packages/gluon/image/mnist.md b/docs/python_docs/python/tutorials/packages/gluon/image/mnist.md index f18ec1a2357f..bed7f0848052 100644 --- a/docs/python_docs/python/tutorials/packages/gluon/image/mnist.md +++ b/docs/python_docs/python/tutorials/packages/gluon/image/mnist.md @@ -47,7 +47,7 @@ Before we define the model, let's first fetch the [MNIST](http://yann.lecun.com/ The following source code downloads and loads the images and the corresponding labels into memory. -```python +```{.python .input} import mxnet as mx # Fixing the random seed @@ -63,7 +63,7 @@ Data iterators take care of this by randomly shuffling the inputs. Note that we The following source code initializes the data iterators for the MNIST dataset. Note that we initialize two iterators: one for train data and one for test data. 
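As a quick, self-contained illustration of how `mx.io.NDArrayIter` behaves (toy random data here, not MNIST): each batch it yields carries `data` and `label` lists, and the iterator needs a `reset()` between epochs.

```{.python .input}
import mxnet as mx

toy_data = mx.nd.random.uniform(shape=(10, 1, 28, 28))
toy_label = mx.nd.zeros((10,))
toy_iter = mx.io.NDArrayIter(toy_data, toy_label, batch_size=5, shuffle=True)

for batch in toy_iter:
    print(batch.data[0].shape, batch.label[0].shape)
toy_iter.reset()  # rewind before reusing the iterator for another epoch
```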
-```python +```{.python .input} batch_size = 100 train_data = mx.io.NDArrayIter(mnist['train_data'], mnist['train_label'], batch_size, shuffle=True) val_data = mx.io.NDArrayIter(mnist['test_data'], mnist['test_label'], batch_size) @@ -75,7 +75,7 @@ We will cover a couple of approaches for performing the hand written digit recog Now, let's import required nn modules -```python +```{.python .input} from __future__ import print_function import mxnet as mx from mxnet import gluon @@ -96,7 +96,7 @@ The last fully connected layer often has its hidden size equal to the number of To do this, we will use [Sequential layer](https://mxnet.io/api/python/docs/api/gluon/_autogen/mxnet.gluon.nn.Sequential.html) type. This is simply a linear stack of neural network layers. `nn.Dense` layers are nothing but the fully connected layers we discussed above. -```python +```{.python .input} # define network net = nn.Sequential() net.add(nn.Dense(128, activation='relu')) @@ -115,7 +115,7 @@ We will use [Trainer](/api/python/docs/api/gluon/trainer.html) class to apply th [SGD optimizer](/api/python/docs/api/optimizer/index.html#mxnet.optimizer.SGD) on the initialized parameters. -```python +```{.python .input} gpus = mx.test_utils.list_gpus() ctx = [mx.gpu()] if gpus else [mx.cpu(0), mx.cpu(1)] net.initialize(mx.init.Xavier(magnitude=2.24), ctx=ctx) @@ -140,7 +140,7 @@ There are many predefined loss functions in gluon.loss. Here we use [softmax_cross_entropy_loss](https://mxnet.io/api/python/gluon/gluon.html#mxnet.gluon.loss.softmax_cross_entropy_loss) for digit classification. We will compute loss and do backward propagation inside training scope which is defined by `autograd.record()`. -```python +```{.python .input} %%time epoch = 10 # Use Accuracy as the evaluation metric. @@ -183,7 +183,7 @@ for i in range(epoch): After the above training completes, we can evaluate the trained model by running predictions on validation dataset. Since the dataset also has labels for all test images, we can compute the accuracy metric over validation data as follows: -```python +```{.python .input} # Use Accuracy as the evaluation metric. metric = mx.metric.Accuracy() # Reset the validation data iterator. @@ -218,7 +218,7 @@ The following source code defines a convolutional neural network architecture ca A typical way to write your network is creating a new class inherited from `gluon.Block` class. We can define the network by composing and inheriting Block class as follows: -```python +```{.python .input} import mxnet.ndarray as F class Net(gluon.Block): @@ -248,7 +248,7 @@ We also imported `mxnet.ndarray` package to use activation functions from `ndarr Now, We will create the network as follows: -```python +```{.python .input} net = Net() ``` @@ -264,7 +264,7 @@ Training and prediction can be done in the similar way as we did for MLP. We will initialize the network parameters as follows: -```python +```{.python .input} # set the context on GPU is available otherwise CPU ctx = [mx.gpu() if mx.test_utils.list_gpus() else mx.cpu()] net.initialize(mx.init.Xavier(magnitude=2.24), ctx=ctx) @@ -273,7 +273,7 @@ trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.03}) #### Training -```python +```{.python .input} # Use Accuracy as the evaluation metric. metric = mx.metric.Accuracy() softmax_cross_entropy_loss = gluon.loss.SoftmaxCrossEntropyLoss() @@ -315,7 +315,7 @@ for i in range(epoch): Finally, we'll use the trained LeNet model to generate predictions for the test data. 
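A compact sketch of what that prediction step boils down to, assuming the trained `net`, the `ctx` list and the `val_data` iterator from the cells above (the full evaluation loop is in the next cell): run a batch through the network and take the argmax over the class scores.

```{.python .input}
# peek at one validation batch and pick the most likely digit for each image
val_data.reset()
batch = next(iter(val_data))
output = net(batch.data[0].as_in_context(ctx[0]))
predicted_digits = output.argmax(axis=1)
print(predicted_digits[:10].asnumpy())
```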
-```python +```{.python .input} # Use Accuracy as the evaluation metric. metric = mx.metric.Accuracy() # Reset the validation data iterator. diff --git a/docs/python_docs/python/tutorials/packages/gluon/index.rst b/docs/python_docs/python/tutorials/packages/gluon/index.rst index 7cb083cd96d1..2b771c0dfec9 100644 --- a/docs/python_docs/python/tutorials/packages/gluon/index.rst +++ b/docs/python_docs/python/tutorials/packages/gluon/index.rst @@ -59,12 +59,6 @@ Getting started Saving and loading trained models. - .. card:: - :title: Using pre-trained models in MXNet - :link: image/pretrained_models.html - - Using pre-trained models with Apache MXNet. - Data ---- @@ -74,12 +68,6 @@ Data :title: Data Augmentation :link: data/data_augmentation.html - A guide to data augmentation. - - .. card:: - :title: Image Augmentation - :link: image/image-augmentation.html - Boost your training dataset with image augmentation. .. card:: diff --git a/docs/python_docs/python/tutorials/packages/gluon/loss/custom-loss.md b/docs/python_docs/python/tutorials/packages/gluon/loss/custom-loss.md index 1cae0c91b837..6aa556f92d3a 100644 --- a/docs/python_docs/python/tutorials/packages/gluon/loss/custom-loss.md +++ b/docs/python_docs/python/tutorials/packages/gluon/loss/custom-loss.md @@ -25,7 +25,7 @@ All neural networks need a loss function for training. A loss function is a quan However, we may sometimes want to solve problems that require customized loss functions; this tutorial shows how we can do that in Gluon. We will implement contrastive loss which is typically used in Siamese networks. -```python +```{.python .input} import matplotlib.pyplot as plt import mxnet as mx from mxnet import autograd, gluon, nd @@ -48,7 +48,7 @@ The loss function uses a margin *m* which is has the effect that dissimlar pairs In order to implement such a customized loss function in Gluon, we only need to define a new class that is inheriting from the [Loss](/api/python/docs/api/gluon/loss/index.html#mxnet.gluon.loss.Loss) base class. We then define the contrastive loss logic in the [hybrid_forward](/api/python/docs/api/gluon/hybrid_block.html#mxnet.gluon.HybridBlock.hybrid_forward) method. This method takes the images `image1`, `image2` and the label which defines whether `image1` and `image2` are similar (=0) or dissimilar (=1). The input F is an `mxnet.ndarry` or an `mxnet.symbol` if we hybridize the network. Gluon's `Loss` base class is in fact a [HybridBlock](/api/python/docs/api/gluon/hybrid_block.html). This means we can either run imperatively or symbolically. When we hybridize our custom loss function, we can get performance speedups. -```python +```{.python .input} class ContrastiveLoss(Loss): def __init__(self, margin=6., weight=None, batch_axis=0, **kwargs): super(ContrastiveLoss, self).__init__(weight, batch_axis, **kwargs) @@ -71,7 +71,7 @@ A [Siamese network](https://papers.nips.cc/paper/769-signature-verification-usin Our network consists of 2 convolutional and max pooling layers that downsample the input image. The output is then fed through a fully connected layer with 256 hidden units and another fully connected layer with 2 hidden units. -```python +```{.python .input} class Siamese(gluon.HybridBlock): def __init__(self, **kwargs): super(Siamese, self).__init__(**kwargs) @@ -95,7 +95,7 @@ class Siamese(gluon.HybridBlock): We train our network on the [Ominglot](http://www.omniglot.com/) dataset which is a collection of 1623 hand drawn characters from 50 alphabets. 
You can download it from [here](https://github.com/brendenlake/omniglot/tree/master/python). We need to create a dataset that contains a random set of similar and dissimilar images. We use Gluon's `ImageFolderDataset` where we overwrite `__getitem__` and randomly return similar and dissimilar pairs of images. -```python +```{.python .input} class GetImagePairs(mx.gluon.data.vision.ImageFolderDataset): def __init__(self, root): super(GetImagePairs, self).__init__(root, flag=0) @@ -125,7 +125,7 @@ class GetImagePairs(mx.gluon.data.vision.ImageFolderDataset): We train the network on a subset of the data, the [*Tifinagh*](https://www.omniglot.com/writing/tifinagh.htm) alphabet. Once the model is trained we test it on the [*Inuktitut*](https://www.omniglot.com/writing/inuktitut.htm) alphabet. -```python +```{.python .input} def transform(img0, img1, label): normalized_img0 = nd.transpose(img0.astype('float32'), (2, 0, 1))/255.0 normalized_img1 = nd.transpose(img1.astype('float32'), (2, 0, 1))/255.0 @@ -144,7 +144,7 @@ test_dataloader = gluon.data.DataLoader(test.transform(transform), Following code plots some examples from the test dataset. -```python +```{.python .input} img1, img2, label = test[0] print("Same: {}".format(int(label.asscalar()) == 0)) fig, (ax0, ax1) = plt.subplots(ncols=2, figsize=(10, 5)) @@ -164,7 +164,7 @@ plt.show() Before we can start training, we need to instantiate the custom constrastive loss function and initialize the model. -```python +```{.python .input} model = Siamese() model.initialize(init=mx.init.Xavier()) trainer = gluon.Trainer(model.collect_params(), 'adam', {'learning_rate': 0.001}) @@ -174,7 +174,7 @@ loss = ContrastiveLoss(margin=6.0) Start the training loop: -```python +```{.python .input} for epoch in range(10): for i, data in enumerate(train_dataloader): image1, image2, label = data @@ -192,7 +192,7 @@ for epoch in range(10): During inference we compute the Euclidean distance between the output vectors of the Siamese network. High distance indicates dissimilarity, low values indicate similarity. -```python +```{.python .input} for i, data in enumerate(test_dataloader): img1, img2, label = data output1, output2 = model(img1, img2) diff --git a/docs/python_docs/python/tutorials/packages/gluon/loss/loss.md b/docs/python_docs/python/tutorials/packages/gluon/loss/loss.md index 018e75f5fcab..e7efbcadc467 100644 --- a/docs/python_docs/python/tutorials/packages/gluon/loss/loss.md +++ b/docs/python_docs/python/tutorials/packages/gluon/loss/loss.md @@ -209,7 +209,7 @@ The loss is large, if the predicted probability distribution is far from the gro For instance, in the following example we get a KL divergence of 0.02. We set ```from_logits=False```, so the loss functions will apply ```log_softmax``` on the network output, before computing the KL divergence. 
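That figure can be roughly cross-checked by hand (a sketch; the built-in loss appears to also average over the class axis, so its reported value is smaller than the raw KL sum computed here): apply `log_softmax` to the output and evaluate the KL divergence directly.

```{.python .input}
import mxnet as mx

output = mx.nd.array([[0.39056206, 1.3068528, 0.39056206, -0.30258512]])
target_dist = mx.nd.array([[0.3, 0.4, 0.1, 0.2]])
log_q = mx.nd.log_softmax(output)
# raw per-sample KL sum (about 0.10); dividing by the 4 classes gives roughly
# the smaller value quoted above
kl = (target_dist * (target_dist.log() - log_q)).sum(axis=1)
print(kl.asnumpy())
```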
-```python +```{.python .input} output = mx.nd.array([[0.39056206, 1.3068528, 0.39056206, -0.30258512]]) print('output.softmax(): {}'.format(output.softmax().asnumpy().tolist())) target_dist = mx.nd.array([[0.3, 0.4, 0.1, 0.2]]) diff --git a/docs/python_docs/python/tutorials/packages/gluon/training/fit_api_tutorial.md b/docs/python_docs/python/tutorials/packages/gluon/training/fit_api_tutorial.md index 3858d0f5e17f..ce270da32d3d 100644 --- a/docs/python_docs/python/tutorials/packages/gluon/training/fit_api_tutorial.md +++ b/docs/python_docs/python/tutorials/packages/gluon/training/fit_api_tutorial.md @@ -31,7 +31,7 @@ To complete this tutorial, you will need: - [Jupyter Notebook](https://jupyter.org/index.html) (For interactively running the provided .ipynb file) -```python +```{.python .input} import mxnet as mx from mxnet import gluon from mxnet.gluon.model_zoo import vision @@ -52,7 +52,7 @@ ctx = [mx.gpu(i) for i in range(gpu_count)] if gpu_count > 0 else mx.cpu() We will use the ```gluon.data.vision``` package to directly import the Fashion-MNIST dataset and perform pre-processing on it. -```python +```{.python .input} # Get the training data fashion_mnist_train = gluon.data.vision.FashionMNIST(train=True) @@ -61,7 +61,7 @@ fashion_mnist_val = gluon.data.vision.FashionMNIST(train=False) ``` -```python +```{.python .input} transforms = [gluon.data.vision.transforms.Resize(224), # We pick 224 as the model we use takes an input of size 224. gluon.data.vision.transforms.ToTensor()] @@ -70,14 +70,14 @@ transforms = gluon.data.vision.transforms.Compose(transforms) ``` -```python +```{.python .input} # Apply the transformations fashion_mnist_train = fashion_mnist_train.transform_first(transforms) fashion_mnist_val = fashion_mnist_val.transform_first(transforms) ``` -```python +```{.python .input} batch_size = 256 # Batch size of the images num_workers = 4 # The number of parallel workers for loading the data using Data Loaders. @@ -92,7 +92,7 @@ val_data_loader = gluon.data.DataLoader(fashion_mnist_val, batch_size=batch_size Let's load the resnet-18 model architecture from [Gluon Model Zoo](https://mxnet.apache.org/api/python/gluon/model_zoo.html) and initialize its parameters. The Gluon Model Zoo contains a repository of pre-trained models as well the model architecture definitions. We are using the model architecture from the model zoo in order to train it from scratch. -```python +```{.python .input} resnet_18_v1 = vision.resnet18_v1(pretrained=False, classes = 10) resnet_18_v1.initialize(init = mx.init.Xavier(), ctx=ctx) ``` @@ -101,14 +101,14 @@ We will be using `SoftmaxCrossEntropyLoss` as the loss function since this is a You can experiment with a [different loss](/api/python/docs/api/gluon/loss/index.html) or [optimizer](/api/python/docs/api/optimizer/index.html) as well. -```python +```{.python .input} loss_fn = gluon.loss.SoftmaxCrossEntropyLoss() ``` Let's define the trainer object for training the model. 
-```python +```{.python .input} learning_rate = 0.04 # You can experiment with your own learning rate here num_epochs = 2 # You can run training for more epochs trainer = gluon.Trainer(resnet_18_v1.collect_params(), @@ -124,7 +124,7 @@ In the basic usage example, with just 2 lines of code, we will set up our model ### Basic Usage -```python +```{.python .input} train_acc = mx.metric.Accuracy() # Metric to monitor # Define the estimator, by passing to it the model, loss function, metrics, trainer object and context @@ -174,7 +174,7 @@ Our custom event handler is a simple one: record the loss values at the end of e Note: For each of the method, the `Estimator` object is passed along, so you can access training metrics. -```python +```{.python .input} class LossRecordHandler(TrainBegin, TrainEnd, EpochEnd): def __init__(self): super(LossRecordHandler, self).__init__() @@ -201,7 +201,7 @@ class LossRecordHandler(TrainBegin, TrainEnd, EpochEnd): ``` -```python +```{.python .input} # Let's reset the model, trainer and accuracy objects from above resnet_18_v1.initialize(force_reinit=True, init = mx.init.Xavier(), ctx=ctx) @@ -211,7 +211,7 @@ train_acc = mx.metric.Accuracy() ``` -```python +```{.python .input} # Define the estimator, by passing to it the model, loss function, metrics, trainer object and context est = estimator.Estimator(net=resnet_18_v1, loss=loss_fn, @@ -255,7 +255,7 @@ with warnings.catch_warnings(): You can load the saved model, by using the `load_parameters` API in Gluon. For more details refer to the [Loading model parameters from file tutorial](/api/python/docs/tutorials/packages/gluon/blocks/save_load_params.html#saving-model-parameters-to-file) -```python +```{.python .input} resnet_18_v1 = vision.resnet18_v1(pretrained=False, classes=10) resnet_18_v1.load_parameters('./my_model-best.params', ctx=ctx) ``` diff --git a/docs/python_docs/python/tutorials/packages/gluon/training/learning_rates/learning_rate_finder.md b/docs/python_docs/python/tutorials/packages/gluon/training/learning_rates/learning_rate_finder.md index a32c8a1e92cd..0cab619a72bb 100644 --- a/docs/python_docs/python/tutorials/packages/gluon/training/learning_rates/learning_rate_finder.md +++ b/docs/python_docs/python/tutorials/packages/gluon/training/learning_rates/learning_rate_finder.md @@ -41,7 +41,7 @@ As expected, for very small learning rates we don't see much change in the loss Usually, our unit of work is an epoch (a full pass through the dataset) and the learning rate would typically be held constant throughout the epoch. With the Learning Rate Finder (and cyclical learning rate schedules) we are required to vary the learning rate every iteration. As such we structure our training code so that a single iteration can be run with a given learning rate. You can implement Learner as you wish. Just initialize the network, define the loss and trainer in `__init__` and keep your training logic for a single batch in `iteration`. -```python +```{.python .input} import mxnet as mx # Set seed for reproducibility @@ -96,7 +96,7 @@ class Learner(): We also adjust our `DataLoader` so that it continuously provides batches of data and doesn't stop after a single epoch. We can then call `iteration` as many times as required for the loss to diverge as part of the Learning Rate Finder process. We implement a custom `BatchSampler` for this, that keeps returning random indices of samples to be included in the next batch. We use the CIFAR-10 dataset for image classification to test our Learning Rate Finder. 
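The idea behind such a sampler can be sketched in a few lines (the `InfiniteBatchSampler` name here is hypothetical and purely illustrative; the tutorial's own implementation is in the next cell):

```{.python .input}
import random

class InfiniteBatchSampler:
    """Yields random batches of indices forever, so iteration never stops at an epoch boundary."""
    def __init__(self, num_samples, batch_size):
        self.num_samples = num_samples
        self.batch_size = batch_size

    def __iter__(self):
        while True:
            yield [random.randrange(self.num_samples) for _ in range(self.batch_size)]

sampler = InfiniteBatchSampler(num_samples=50000, batch_size=4)
print(next(iter(sampler)))
```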
-```python +```{.python .input} from mxnet.gluon.data.vision import transforms transform = transforms.Compose([ @@ -133,7 +133,7 @@ data_loader = mx.gluon.data.DataLoader(dataset, batch_sampler=batch_sampler) With preparation complete, we're ready to write our Learning Rate Finder that wraps the `Learner` we defined above. We implement a `find` method for the procedure, and `plot` for the visualization. Starting with a very low learning rate as defined by `lr_start` we train one iteration at a time and keep multiplying the learning rate by `lr_multiplier`. We analyse the loss and continue until it diverges according to `LRFinderStoppingCriteria` (which is defined later on). You may also notice that we save the parameters and state of the optimizer before the process and restore afterwards. This is so the Learning Rate Finder process doesn't impact the state of the model, and can be used at any point during training. -```python +```{.python .input} from matplotlib import pyplot as plt class LRFinder(): @@ -197,7 +197,7 @@ class LRFinder(): You can define the `LRFinderStoppingCriteria` as you wish, but empirical testing suggests using a smoothed average gives a more consistent stopping rule (see `smoothing`). We stop when the smoothed average of the loss exceeds twice the initial loss, assuming there have been a minimum number of iterations (see `min_iter`). -```python +```{.python .input} class LRFinderStoppingCriteria(): def __init__(self, smoothing=0.3, min_iter=20): """ @@ -230,7 +230,7 @@ class LRFinderStoppingCriteria(): Using a Pre-activation ResNet-18 from the Gluon model zoo, we instantiate our Learner and fire up our Learning Rate Finder! -```python +```{.python .input} ctx = mx.gpu() if mx.context.num_gpus() else mx.cpu() net = mx.gluon.model_zoo.vision.resnet18_v2(classes=10) learner = Learner(net=net, data_loader=data_loader, ctx=ctx) @@ -246,7 +246,7 @@ lr_finder.plot() As discussed before, we should select a learning rate where the loss is falling (i.e. from 0.001 to 0.05) but before the loss starts to diverge (i.e. 0.1). We prefer higher learning rates where possible, so we select an initial learning rate of 0.05. Just as a test, we will run 500 epochs using this learning rate and evaluate the loss on the final batch. As we're working with a single batch of 128 samples, the variance of the loss estimates will be reasonably high, but it will give us a general idea. We save the initialized parameters for a later comparison with other learning rates. -```python +```{.python .input} learner.net.save_parameters("net.params") lr = 0.05 @@ -272,7 +272,7 @@ We see a sizable drop in the loss from approx. 2.7 to 1.2. And now we have a baseline, let's see what happens when we train with a learning rate that's higher than advisable at 0.5. -```python +```{.python .input} net = mx.gluon.model_zoo.vision.resnet18_v2(classes=10) learner = Learner(net=net, data_loader=data_loader, ctx=ctx) learner.net.load_parameters("net.params", ctx=ctx) @@ -300,7 +300,7 @@ We still observe a fall in the loss but aren't able to reach as low as before. And lastly, we see how the model trains with a more conservative learning rate of 0.005. 
-```python +```{.python .input} net = mx.gluon.model_zoo.vision.resnet18_v2(classes=10) learner = Learner(net=net, data_loader=data_loader, ctx=ctx) learner.net.load_parameters("net.params", ctx=ctx) diff --git a/docs/python_docs/python/tutorials/packages/gluon/training/learning_rates/learning_rate_schedules.md b/docs/python_docs/python/tutorials/packages/gluon/training/learning_rates/learning_rate_schedules.md index 86d0f8fdd2c8..e26218a05836 100644 --- a/docs/python_docs/python/tutorials/packages/gluon/training/learning_rates/learning_rate_schedules.md +++ b/docs/python_docs/python/tutorials/packages/gluon/training/learning_rates/learning_rate_schedules.md @@ -28,7 +28,7 @@ Schedules define how the learning rate changes over time and are typically speci In this tutorial, we visualize the schedules defined in `mx.lr_scheduler`, show how to implement custom schedules and see an example of using a schedule while training models. Since schedules are passed to `mx.optimizer.Optimizer` classes, these methods work with both Module and Gluon APIs. -```python +```{.python .input} from __future__ import print_function import math import matplotlib.pyplot as plt @@ -39,7 +39,7 @@ import numpy as np %matplotlib inline ``` -```python +```{.python .input} def plot_schedule(schedule_fn, iterations=1500): # Iteration count starting at 1 iterations = [i+1 for i in range(iterations)] @@ -59,7 +59,7 @@ In this section, we take a look at the schedules in `mx.lr_scheduler`. All of th One of the most commonly used learning rate schedules is called stepwise decay, where the learning rate is reduced by a factor at certain intervals. MXNet implements a `FactorScheduler` for equally spaced intervals, and `MultiFactorScheduler` for greater control. We start with an example of halving the learning rate every 250 iterations. More precisely, the learning rate will be multiplied by `factor` _after_ the `step` index and multiples thereafter. So in the example below the learning rate of the 250th iteration will be 1 and the 251st iteration will be 0.5. -```python +```{.python .input} schedule = mx.lr_scheduler.FactorScheduler(step=250, factor=0.5) schedule.base_lr = 1 plot_schedule(schedule) @@ -74,7 +74,7 @@ Note: the `base_lr` is used to determine the initial learning rate. It takes a d We can define non-uniform intervals with `MultiFactorScheduler` and in the example below we halve the learning rate _after_ the 250th, 750th (i.e. a step length of 500 iterations) and 900th (a step length of 150 iterations). As before, the learning rate of the 250th iteration will be 1 and the 251th iteration will be 0.5. -```python +```{.python .input} schedule = mx.lr_scheduler.MultiFactorScheduler(step=[250, 750, 900], factor=0.5) schedule.base_lr = 1 plot_schedule(schedule) @@ -89,7 +89,7 @@ plot_schedule(schedule) Stepwise schedules and the discontinuities they introduce may sometimes lead to instability in the optimization, so in some cases smoother schedules are preferred. `PolyScheduler` gives a smooth decay using a polynomial function and reaches a learning rate of 0 after `max_update` iterations. In the example below, we have a quadratic function (`pwr=2`) that falls from 0.998 at iteration 1 to 0 at iteration 1000. After this the learning rate stays at 0, so nothing will be learnt from `max_update` iterations onwards. 
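As a quick sanity check on that 0.998 figure, assuming the usual polynomial decay form `base_lr * (1 - t / max_update) ** pwr`:

```{.python .input}
base_lr, max_update, pwr = 1, 1000, 2
for t in [1, 500, 999, 1000]:
    print(t, base_lr * (1 - t / max_update) ** pwr)
```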
-```python +```{.python .input} schedule = mx.lr_scheduler.PolyScheduler(max_update=1000, base_lr=1, pwr=2) plot_schedule(schedule) ``` @@ -107,7 +107,7 @@ And we don't evaluate at `iteration=0` (to get `base_lr`) since we are working w You can implement your own custom schedule with a function or callable class, that takes an integer denoting the iteration index (starting at 1) and returns a float representing the learning rate to be used for that iteration. We implement the Cosine Annealing Schedule in the example below as a callable class (see `__call__` method). -```python +```{.python .input} class CosineAnnealingSchedule(): def __init__(self, min_lr, max_lr, cycle_length): self.min_lr = min_lr @@ -138,7 +138,7 @@ While training a simple handwritten digit classifier on the MNIST dataset, we ta As discussed above, the schedule should return a learning rate given an (1-based) iteration index. -```python +```{.python .input} # Use GPU if one exists, else use CPU ctx = mx.gpu() if mx.context.num_gpus() else mx.cpu() @@ -176,7 +176,7 @@ net = build_cnn() We then initialize our network (technically deferred until we pass the first batch) and define the loss. -```python +```{.python .input} # Initialize the parameters with Xavier initializer net.initialize(mx.init.Xavier(), ctx=ctx) # Use cross entropy loss @@ -186,7 +186,7 @@ softmax_cross_entropy = mx.gluon.loss.SoftmaxCrossEntropyLoss() We're now ready to create our schedule, and in this example we opt for a stepwise decay schedule using `MultiFactorScheduler`. Since we're only training a demonstration model for a limited number of epochs (10 in total) we will exaggerate the schedule and drop the learning rate by 90% after the 4th, 7th and 9th epochs. We call these steps, and the drop occurs _after_ the step index. Schedules are defined for iterations (i.e. training batches), so we must represent our steps in iterations too. -```python +```{.python .input} steps_epochs = [4, 7, 9] # assuming we keep partial batches, see `last_batch` parameter of DataLoader iterations_per_epoch = math.ceil(len(train_dataset) / batch_size) @@ -201,26 +201,26 @@ Learning rate drops after iterations: [3752, 6566, 8442] ``` -```python +```{.python .input} schedule = mx.lr_scheduler.MultiFactorScheduler(step=steps_iterations, factor=0.1) ``` **We create our `Optimizer` and pass the schedule via the `lr_scheduler` parameter.** In this example we're using Stochastic Gradient Descent. -```python +```{.python .input} sgd_optimizer = mx.optimizer.SGD(learning_rate=0.03, lr_scheduler=schedule) ``` And we use this optimizer (with schedule) in our `Trainer` and train for 10 epochs. Alternatively, we could have set the `optimizer` to the string `sgd`, and pass a dictionary of the optimizer parameters directly to the trainer using `optimizer_params`. -```python +```{.python .input} trainer = mx.gluon.Trainer(params=net.collect_params(), optimizer=sgd_optimizer) ``` -```python +```{.python .input} num_epochs = 10 # epoch and batch counts starting at 1 for epoch in range(1, num_epochs+1): @@ -277,7 +277,7 @@ When using the method above you don't need to manually keep track of iteration c We replicate the example above, but now keep track of the `iteration_idx`, call the schedule and set the learning rate appropriately using `set_learning_rate`. We also use `schedule.base_lr` to set the initial learning rate for the schedule since we are calling the schedule directly and not using it as part of the `Optimizer`. 
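The step values printed below are consistent with the arithmetic you would expect here; this sketch assumes the 60,000-image training set and a batch size of 64 from the earlier data-loading cells (both are assumptions of the check, not new settings):

```{.python .input}
import math

num_train_samples, assumed_batch_size = 60000, 64   # assumed from the data-loading cells
iterations_per_epoch = math.ceil(num_train_samples / assumed_batch_size)
print([epoch * iterations_per_epoch for epoch in [4, 7, 9]])   # [3752, 6566, 8442]
```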
-```python +```{.python .input} net = build_cnn() net.initialize(mx.init.Xavier(), ctx=ctx) @@ -343,4 +343,4 @@ Once again, we see the learning rate start at 0.03, and fall to 0.00003 by the e We have a related tutorial on Advanced Learning Rate Schedules that shows reference implementations of schedules that give state-of-the-art results. We look at cyclical schedules applied to a variety of cycle shapes, and many other techniques such as warm-up and cool-down. - \ No newline at end of file + diff --git a/docs/python_docs/python/tutorials/packages/gluon/training/learning_rates/learning_rate_schedules_advanced.md b/docs/python_docs/python/tutorials/packages/gluon/training/learning_rates/learning_rate_schedules_advanced.md index c59c9515f02e..e6c40cd555da 100644 --- a/docs/python_docs/python/tutorials/packages/gluon/training/learning_rates/learning_rate_schedules_advanced.md +++ b/docs/python_docs/python/tutorials/packages/gluon/training/learning_rates/learning_rate_schedules_advanced.md @@ -23,7 +23,7 @@ Given the importance of learning rate and the learning rate schedule for trainin See the "Learning Rate Schedules" tutorial for a more basic overview of learning rates, and an example of how to use them while training your own models. -```python +```{.python .input} %matplotlib inline import copy import math @@ -32,7 +32,7 @@ import numpy as np import matplotlib.pyplot as plt ``` -```python +```{.python .input} def plot_schedule(schedule_fn, iterations=1500): # Iteration count starting at 1 iterations = [i+1 for i in range(iterations)] @@ -54,7 +54,7 @@ We look at "warm-up" in more detail later in the tutorial, but this could be vie One adjustment proposed by [Jeremy Howard, Sebastian Ruder (2018)](https://arxiv.org/abs/1801.06146) was to change the ratio between the increasing and decreasing stages, instead of the 50:50 split. Changing the increasing fraction (`inc_fraction!=0.5`) leads to a **"slanted triangular"** schedule. Using `inc_fraction<0.5` tends to give better results. -```python +```{.python .input} class TriangularSchedule(): def __init__(self, min_lr, max_lr, cycle_length, inc_fraction=0.5): """ @@ -82,7 +82,7 @@ class TriangularSchedule(): We look an example of a slanted triangular schedule that increases from a learning rate of 1 to 2, and back to 1 over 1000 iterations. Since we set `inc_fraction=0.2`, 200 iterations are used for the increasing stage, and 800 for the decreasing stage. After this, the schedule stays at the lower bound indefinitely. -```python +```{.python .input} schedule = TriangularSchedule(min_lr=1, max_lr=2, cycle_length=1000, inc_fraction=0.2) plot_schedule(schedule) ``` @@ -96,7 +96,7 @@ plot_schedule(schedule) Continuing with the idea that smooth decay profiles give improved performance over stepwise decay, [Ilya Loshchilov, Frank Hutter (2016)](https://arxiv.org/abs/1608.03983) used **"cosine annealing"** schedules to good effect. As with triangular schedules, the original idea was that this should be used as part of a cyclical schedule, but we begin by implementing the cosine annealing component before the full Stochastic Gradient Descent with Warm Restarts (SGDR) method later in the tutorial. -```python +```{.python .input} class CosineAnnealingSchedule(): def __init__(self, min_lr, max_lr, cycle_length): """ @@ -120,7 +120,7 @@ class CosineAnnealingSchedule(): We look at an example of a cosine annealing schedule that smoothing decreases from a learning rate of 2 to 1 across 1000 iterations. 
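The core of that pattern can be shown with a toy model (a sketch only, not the tutorial's network): query the schedule with the iteration index on every step and push the value into the trainer before calling `step`.

```{.python .input}
import mxnet as mx

sched = mx.lr_scheduler.FactorScheduler(step=5, factor=0.5)
sched.base_lr = 0.1

toy_net = mx.gluon.nn.Dense(1)
toy_net.initialize()
toy_trainer = mx.gluon.Trainer(toy_net.collect_params(), 'sgd',
                               {'learning_rate': sched.base_lr})
l2_loss = mx.gluon.loss.L2Loss()

x = mx.nd.random.uniform(shape=(8, 4))
y = mx.nd.random.uniform(shape=(8, 1))

for iteration_idx in range(1, 11):
    toy_trainer.set_learning_rate(sched(iteration_idx))  # drive the schedule manually
    with mx.autograd.record():
        loss = l2_loss(toy_net(x), y)
    loss.backward()
    toy_trainer.step(x.shape[0])
print(toy_trainer.learning_rate)
```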
After this, the schedule stays at the lower bound indefinietly. -```python +```{.python .input} schedule = CosineAnnealingSchedule(min_lr=1, max_lr=2, cycle_length=1000) plot_schedule(schedule) ``` @@ -140,7 +140,7 @@ Unlike the schedules above and those implemented in `mx.lr_scheduler`, these cla Using the idea of linear warm-up of the learning rate proposed in ["Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour" by Priya Goyal et al. (2017)](https://arxiv.org/abs/1706.02677), we implement a wrapper class that adds warm-up to an existing schedule. Going from `start_lr` to the initial learning rate of the `schedule` over `length` iterations, this adjustment is useful when training with large batch sizes. -```python +```{.python .input} class LinearWarmUp(): def __init__(self, schedule, start_lr, length): """ @@ -164,7 +164,7 @@ class LinearWarmUp(): As an example, we add a linear warm-up of the learning rate (from 0 to 1 over 250 iterations) to a stepwise decay schedule. We first create the `MultiFactorScheduler` (and set the `base_lr`) and then pass it to `LinearWarmUp` to add the warm-up at the start. We can use `LinearWarmUp` with any other schedule including `CosineAnnealingSchedule`. -```python +```{.python .input} schedule = mx.lr_scheduler.MultiFactorScheduler(step=[250, 750, 900], factor=0.5) schedule.base_lr = 1 schedule = LinearWarmUp(schedule, start_lr=0, length=250) @@ -180,7 +180,7 @@ plot_schedule(schedule) Similarly, we could add a linear cool-down period to our schedule and this is used in the "1-Cycle" schedule proposed by [Leslie N. Smith, Nicholay Topin (2017)](https://arxiv.org/abs/1708.07120) to train neural networks very quickly in certain circumstances (coined "super-convergence"). We reduce the learning rate from its value at `start_idx` of `schedule` to `finish_lr` over a period of `length`, and then maintain `finish_lr` thereafter. -```python +```{.python .input} class LinearCoolDown(): def __init__(self, schedule, finish_lr, start_idx, length): """ @@ -209,7 +209,7 @@ class LinearCoolDown(): As an example, we apply learning rate cool-down to a `MultiFactorScheduler`. Starting the cool-down at iteration 1000, we reduce the learning rate linearly from 0.125 to 0.001 over 500 iterations, and hold the learning rate at 0.001 after this. -```python +```{.python .input} schedule = mx.lr_scheduler.MultiFactorScheduler(step=[250, 750, 900], factor=0.5) schedule.base_lr = 1 schedule = LinearCoolDown(schedule, finish_lr=0.001, start_idx=1000, length=500) @@ -225,7 +225,7 @@ plot_schedule(schedule) So we can implement the "1-Cycle" schedule proposed by [Leslie N. Smith, Nicholay Topin (2017)](https://arxiv.org/abs/1708.07120) we use a single and symmetric cycle of the triangular schedule above (i.e. `inc_fraction=0.5`), followed by a cool-down period of `cooldown_length` iterations. -```python +```{.python .input} class OneCycleSchedule(): def __init__(self, start_lr, max_lr, cycle_length, cooldown_length=0, finish_lr=None): """ @@ -251,7 +251,7 @@ class OneCycleSchedule(): As an example, we linearly increase and then decrease the learning rate from 0.1 to 0.5 and back over 500 iterations (i.e. single triangular cycle), before reducing the learning rate further to 0.001 over the next 750 iterations (i.e. cool-down). -```python +```{.python .input} schedule = OneCycleSchedule(start_lr=0.1, max_lr=0.5, cycle_length=500, cooldown_length=750, finish_lr=0.001) plot_schedule(schedule) ``` @@ -265,7 +265,7 @@ plot_schedule(schedule) Originally proposed by [Leslie N. 
Smith (2015)](https://arxiv.org/abs/1506.01186), the idea of cyclically increasing and decreasing the learning rate has been shown to give faster convergence and more optimal solutions. We implement a wrapper class that loops existing cycle-based schedules such as `TriangularSchedule` and `CosineAnnealingSchedule` to provide infinitely repeating schedules. We pass the schedule class (rather than an instance) because one feature of the `CyclicalSchedule` is to vary the `cycle_length` over time as seen in [Ilya Loshchilov, Frank Hutter (2016)](https://arxiv.org/abs/1608.03983) using `cycle_length_decay`. Another feature is the ability to decay the cycle magnitude over time with `cycle_magnitude_decay`. -```python +```{.python .input} class CyclicalSchedule(): def __init__(self, schedule_class, cycle_length, cycle_length_decay=1, cycle_magnitude_decay=1, **kwargs): """ @@ -298,7 +298,7 @@ class CyclicalSchedule(): As an example, we implement the triangular cyclical schedule presented in ["Cyclical Learning Rates for Training Neural Networks" by Leslie N. Smith (2015)](https://arxiv.org/abs/1506.01186). We use slightly different terminology to the paper here because we use `cycle_length` that is twice the 'stepsize' used in the paper. We repeat cycles, each with a length of 500 iterations and lower and upper learning rate bounds of 0.5 and 2 respectively. -```python +```{.python .input} schedule = CyclicalSchedule(TriangularSchedule, min_lr=0.5, max_lr=2, cycle_length=500) plot_schedule(schedule) ``` @@ -310,7 +310,7 @@ plot_schedule(schedule) And lastly, we implement the scheduled used in ["SGDR: Stochastic Gradient Descent with Warm Restarts" by Ilya Loshchilov, Frank Hutter (2016)](https://arxiv.org/abs/1608.03983). We repeat cosine annealing schedules, but each time we halve the magnitude and double the cycle length. -```python +```{.python .input} schedule = CyclicalSchedule(CosineAnnealingSchedule, min_lr=0.01, max_lr=2, cycle_length=250, cycle_length_decay=2, cycle_magnitude_decay=0.5) plot_schedule(schedule) diff --git a/docs/python_docs/python/tutorials/packages/gluon/training/normalization/index.md b/docs/python_docs/python/tutorials/packages/gluon/training/normalization/index.md index 962834909a97..c17abe101613 100644 --- a/docs/python_docs/python/tutorials/packages/gluon/training/normalization/index.md +++ b/docs/python_docs/python/tutorials/packages/gluon/training/normalization/index.md @@ -41,7 +41,7 @@ Warning: You should calculate the normalization means and standard deviations us When using pre-trained models from the [Gluon Model Zoo](https://mxnet.apache.org/api/python/gluon/model_zoo.html) you'll usually see the normalization statistics used for training (i.e. statistics from step 1). You'll want to use these statistics to normalize your own input data for fine-tuning or inference with these models. Using `transforms.Normalize` is one way of applying the normalization, and this should be used in the `Dataset`. -```python +```{.python .input} import mxnet as mx from mxnet.gluon.data.vision.transforms import Normalize @@ -81,7 +81,7 @@ Warning: it seems that `BatchNorm` is better suited to convolutional networks (C As an example, we'll apply `BatchNorm` to a batch of 2 samples, each with 2 channels, and both height and width of 2 (in NCHW format). -```python +```{.python .input} data = mx.nd.arange(start=0, stop=2*2*2*2).reshape(2, 2, 2, 2) print(data) ``` @@ -89,14 +89,14 @@ print(data) With MXNet Gluon we can apply batch normalization with the `mx.gluon.nn.BatchNorm` block. 
It can be created and used just like any other MXNet Gluon block (such as `Conv2D`). Its input will typically be unnormalized activations from the previous layer, and the output will be the normalized activations ready for the next layer. Since we're using data in NCHW format we can use the default axis. -```python +```{.python .input} net = mx.gluon.nn.BatchNorm() ``` We still need to initialize the block because it has a number of trainable parameters, as we'll see later on. -```python +```{.python .input} net.initialize() ``` @@ -107,7 +107,7 @@ Remember: `BatchNorm` runs differently during training and inference. When train Warning: `BatchNorm` assumes the channel dimension is the 2nd in order (i.e. `axis=1`). You need to ensure your data has a channel dimension, and change the `axis` parameter of `BatchNorm` if it's not the 2nd dimension. A batch of greyscale images of shape `(100,32,32)` would not work, since the 2nd dimension is height and not channel. You'd need to add a channel dimension using `data.expand_dims(1)` in this case to give shape `(100,1,32,32)`. -```python +```{.python .input} with mx.autograd.record(): output = net(data) loss = output.abs() @@ -118,7 +118,7 @@ print(output) We can immediately see the activations have been scaled down and centered around zero. Activations are the same for each channel, because each channel was normalized independently. We can do a quick sanity check on these results, by manually calculating the batch mean and variance for each channel. -```python +```{.python .input} batch_means = data.mean(axis=1, exclude=True) batch_vars = (data - batch_means.reshape(1, -1, 1, 1)).square().mean(axis=1, exclude=True) print('batch_means:', batch_means.asnumpy()) @@ -128,7 +128,7 @@ print('batch_vars:', batch_vars.asnumpy()) And use these to scale the first entry in `data`, to confirm the `BatchNorm` calculation of `-1.324` was correct. -```python +```{.python .input} print("manually calculated:", ((data[0][0][0][0] - batch_means[0])/batch_vars[0].sqrt()).asnumpy()) print("automatically calculated:", output[0][0][0][0].asnumpy()) ``` @@ -141,7 +141,7 @@ Advanced: when using a pre-trained model inside another model (e.g. a pre-traine After a single step (specifically after the `backward` call) we can see the `running_mean` and `running_var` have been updated. -```python +```{.python .input} print('running_mean:', net.running_mean.data().asnumpy()) print('running_var:', net.running_var.data().asnumpy()) ``` @@ -149,7 +149,7 @@ print('running_var:', net.running_var.data().asnumpy()) You should notice though that these running statistics do not match the batch statistics we just calculated. And instead they are just 10% of the value we'd expect. We see this because of the exponential average process, and because the `momentum` parameter of `BatchNorm` is equal to 0.9 : i.e. 10% of the new value, 90% of the old value (which was initialized to 0). Over time the running statistics will converge to the statistics of the input distribution, while still being flexible enough to adjust to shifts in the input distribution. Using the same batch another 100 times (which wouldn't happen in practice), we can see the running statistics converge to the batch statsitics calculated before. -```python +```{.python .input} for i in range(100): with mx.autograd.record(): output = net(data) @@ -168,7 +168,7 @@ Advanced: Sometimes used for input normalization, you can prevent `beta` shiftin We haven't updated these parameters yet, so they should still be as initialized. 
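The convergence behaviour itself can be reproduced with plain Python, using the exponential-moving-average update described above (momentum 0.9) and the first channel's batch mean from this example (5.5):

```{.python .input}
running_mean, momentum, batch_mean = 0.0, 0.9, 5.5
for _ in range(100):
    running_mean = momentum * running_mean + (1 - momentum) * batch_mean
print(running_mean)   # effectively converged to the batch statistic
```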
You can see the default for `beta` is 0 (i.e. not shift) and `gamma` is 1 (i.e. not scale), so the initial behaviour is to keep the distribution unit normalized. -```python +```{.python .input} print('beta:', net.beta.data().asnumpy()) print('gamma:', net.gamma.data().asnumpy()) ``` @@ -176,7 +176,7 @@ print('gamma:', net.gamma.data().asnumpy()) We can also check the gradient on these parameters. Since we were finding the gradient of the sum of absolute values, we would expect the gradient of `gamma` to be equal to the number of points in the data (i.e. 16). So to minimize the loss we'd decrease the value of `gamma`, which would happen as part of a `trainer.step`. -```python +```{.python .input} print('beta gradient:', net.beta.grad().asnumpy()) print('gamma gradient:', net.gamma.grad().asnumpy()) ``` @@ -186,7 +186,7 @@ print('gamma gradient:', net.gamma.grad().asnumpy()) When it comes to inference, `BatchNorm` uses the global statistics that were calculated during training. Since we're using the same batch of data over and over again (and our global running statistics have converged), we get a very similar result to using training mode. `beta` and `gamma` are also applied by default (unless explicitly removed). -```python +```{.python .input} output = net(data) print(output) ``` @@ -211,7 +211,7 @@ Figure 3: `LayerNorm` on NCHW data | Figure 4: `LayerNorm` on NTC data As an example, we'll apply `LayerNorm` to a batch of 2 samples, each with 4 time steps and 2 channels (in NTC format). -```python +```{.python .input} data = mx.nd.arange(start=0, stop=2*4*2).reshape(2, 4, 2) print(data) ``` @@ -219,7 +219,7 @@ print(data) With MXNet Gluon we can apply layer normalization with the `mx.gluon.nn.LayerNorm` block. We need to call `initialize` because `LayerNorm` has two learnable parameters by default: `beta` and `gamma` that are used for post normalization shifting and scaling of each channel. -```python +```{.python .input} net = mx.gluon.nn.LayerNorm() net.initialize() output = net(data) @@ -231,7 +231,7 @@ We can see that normalization has been applied across all channels for each time We can also check the parameters `beta` and `gamma` and see that they are per channel (i.e. 2 of each in this example). -```python +```{.python .input} print('beta:', net.beta.data().asnumpy()) print('gamma:', net.gamma.data().asnumpy()) ``` @@ -250,7 +250,7 @@ Figure 3: `InstanceNorm` on NCHW data | Figure 4: `InstanceNorm` on NTC data As an example, we'll apply `InstanceNorm` to a batch of 2 samples, each with 2 channels, and both height and width of 2 (in NCHW format). -```python +```{.python .input} data = mx.nd.arange(start=0, stop=2*2*2*2).reshape(2, 2, 2, 2) print(data) ``` @@ -258,7 +258,7 @@ print(data) With MXNet Gluon we can apply instance normalization with the `mx.gluon.nn.InstanceNorm` block. We need to call `initialize` because InstanceNorm has two learnable parameters by default: `beta` and `gamma` that are used for post normalization shifting and scaling of each channel. -```python +```{.python .input} net = mx.gluon.nn.InstanceNorm() net.initialize() output = net(data) @@ -268,7 +268,7 @@ print(output) We can also check the parameters `beta` and `gamma` and see that they are per channel (i.e. 2 of each in this example). 
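Before inspecting those parameters, the output printed above can be cross-checked by hand, since `InstanceNorm` normalizes each channel of each sample independently (a sketch that assumes the default epsilon of 1e-5):

```{.python .input}
import mxnet as mx

data = mx.nd.arange(start=0, stop=2*2*2*2).reshape(2, 2, 2, 2)
sample_channel = data[0][0]                     # first sample, first channel
mean = sample_channel.mean()
var = ((sample_channel - mean) ** 2).mean()
print((sample_channel - mean) / (var + 1e-5).sqrt())
```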
-```python +```{.python .input} print('beta:', net.beta.data().asnumpy()) print('gamma:', net.gamma.data().asnumpy()) ``` diff --git a/docs/python_docs/python/tutorials/packages/kvstore/kvstore.md b/docs/python_docs/python/tutorials/packages/kvstore/kvstore.md index c03a03d1080e..81502901aafd 100644 --- a/docs/python_docs/python/tutorials/packages/kvstore/kvstore.md +++ b/docs/python_docs/python/tutorials/packages/kvstore/kvstore.md @@ -26,7 +26,7 @@ and pull data out. Let's consider a simple example: initializing a (`int`, `NDArray`) pair into the store, and then pulling the value out: -```python +```{.python .input} import mxnet as mx kv = mx.kv.create('local') # create a local kv store. @@ -43,7 +43,7 @@ print(a.asnumpy()) For any key that has been initialized, you can push a new value with the same shape to the key: -```python +```{.python .input} kv.push(3, mx.nd.ones(shape)*8) kv.pull(3, out = a) # pull out the value print(a.asnumpy()) @@ -56,7 +56,7 @@ values into the same key, where KVStore will first sum all of these values and then push the aggregated value. Here we will just demonstrate pushing a list of values on CPU. Please note summation only happens if the value list is longer than one -```python +```{.python .input} contexts = [mx.cpu(i) for i in range(4)] b = [mx.nd.ones(shape, ctx) for ctx in contexts] kv.push(3, b) @@ -70,7 +70,7 @@ For each push, KVStore combines the pushed value with the value stored using an `updater`. The default updater is `ASSIGN`. You can replace the default to control how data is merged: -```python +```{.python .input} def update(key, input, stored): print("update on key: %d" % key) stored += input * 2 @@ -81,7 +81,7 @@ print(a.asnumpy()) `[[ 4. 4. 4.],[ 4. 4. 4.]]` -```python +```{.python .input} kv.push(3, mx.nd.ones(shape)) kv.pull(3, out=a) print(a.asnumpy()) @@ -97,7 +97,7 @@ print(a.asnumpy()) You've already seen how to pull a single key-value pair. Similarly, to push, you can pull the value onto several devices with a single call: -```python +```{.python .input} b = [mx.nd.ones(shape, ctx) for ctx in contexts] kv.pull(3, out = b) print(b[1].asnumpy()) @@ -112,7 +112,7 @@ an interface for a list of key-value pairs. For a single device: -```python +```{.python .input} keys = [5, 7, 9] kv.init(keys, [mx.nd.ones(shape)]*len(keys)) kv.push(keys, [mx.nd.ones(shape)]*len(keys)) @@ -131,7 +131,7 @@ print(b[1].asnumpy()) For multiple devices: -```python +```{.python .input} b = [[mx.nd.ones(shape, ctx) for ctx in contexts]] * len(keys) kv.push(keys, b) kv.pull(keys, out = b) diff --git a/docs/python_docs/python/tutorials/packages/legacy/ndarray/01-ndarray-intro.md b/docs/python_docs/python/tutorials/packages/legacy/ndarray/01-ndarray-intro.md index 6bc373e356e8..750f66ae438e 100644 --- a/docs/python_docs/python/tutorials/packages/legacy/ndarray/01-ndarray-intro.md +++ b/docs/python_docs/python/tutorials/packages/legacy/ndarray/01-ndarray-intro.md @@ -43,21 +43,21 @@ To get started, let's import habit of setting a random seed so that you always get the same results that we do. -```python +```{.python .input} import mxnet as mx from mxnet import nd ``` Let's start with a very simple 1-dimensional array with a python list. -```python +```{.python .input} x = nd.array([1,2,3]) print(x) ``` Now a 2-dimensional array. -```python +```{.python .input} y = nd.array([[1,2,3,4], [1,2,3,4], [1,2,3,4]]) print(y) ``` @@ -67,7 +67,7 @@ Specifically, we'll create a 2D array (also called a *matrix*) with 3 rows and 4 columns using the `.empty` function. 
We'll also try out `.full` which takes an additional parameter for what value you want to fill in the array. -```python +```{.python .input} x = nd.empty((3, 3)) print(x) x = nd.full((3,3), 7) @@ -85,7 +85,7 @@ different here (3,10) since the zeros may not produce anything different from empty... or use the two demonstrations to show something interesting or unique... when would I use one over the other?--> -```python +```{.python .input} x = nd.zeros((3, 10)) print(x) ``` @@ -93,7 +93,7 @@ print(x) Similarly, `ndarray` has a function to create a matrix of all ones aptly named [ones](/api/python/docs/api/ndarray/ndarray.html#mxnet.ndarray.ones). -```python +```{.python .input} x = nd.ones((3, 4)) print(x) ``` @@ -109,7 +109,7 @@ Is it that important to introduce zero mean and unit variance right now? Describe more? Or how about explain which is which for the 0 and the 1 and what they're going to do... if it actually matters at this point. --> -```python +```{.python .input} y = nd.random_normal(0, 1, shape=(3, 4)) print(y) ``` @@ -117,7 +117,7 @@ print(y) Sometimes you will want to copy an array by its shape but not its contents. You can do this with `.zeros_like`. -```python +```{.python .input} z = nd.zeros_like(y) print(z) ``` @@ -125,7 +125,7 @@ print(z) As in NumPy, the dimensions of each `NDArray` are accessible via the `.shape` attribute. -```python +```{.python .input} y.shape ``` @@ -135,13 +135,13 @@ how much memory the array occupies. -```python +```{.python .input} y.size ``` We can query the data type using `.dtype`. -```python +```{.python .input} y.dtype ``` @@ -150,7 +150,7 @@ precision, or you might want to use a different data type. You can force the data type when you create the array using a numpy type. This requires you to import numpy first. -```python +```{.python .input} import numpy as np a = nd.array([1,2,3]) b = nd.array([1,2,3], dtype=np.int32) @@ -163,7 +163,7 @@ happen on specific devices that you can set. You can compute on CPU(s), GPU(s), specific GPU, or all of the above depending on your situation and preference. Using `.context` reveals the location of the variable. -```python +```{.python .input} y.context ``` diff --git a/docs/python_docs/python/tutorials/packages/legacy/ndarray/02-ndarray-operations.md b/docs/python_docs/python/tutorials/packages/legacy/ndarray/02-ndarray-operations.md index 352f6b7a0f34..c2270f7953ef 100644 --- a/docs/python_docs/python/tutorials/packages/legacy/ndarray/02-ndarray-operations.md +++ b/docs/python_docs/python/tutorials/packages/legacy/ndarray/02-ndarray-operations.md @@ -35,12 +35,12 @@ Such as element-wise addition: -```python +```{.python .input} import mxnet as mx from mxnet import nd ``` -```python +```{.python .input} x = nd.ones((3, 4)) y = nd.random_normal(0, 1, shape=(3, 4)) print('x=', x) @@ -51,7 +51,7 @@ print('x = x + y, x=', x) Multiplication: -```python +```{.python .input} x = nd.array([1, 2, 3]) y = nd.array([2, 2, 2]) x * y @@ -61,7 +61,7 @@ And exponentiation: -```python +```{.python .input} nd.exp(x) ``` @@ -69,7 +69,7 @@ We can also grab a matrix's transpose to compute a proper matrix-matrix product. -```python +```{.python .input} nd.dot(x, y.T) ``` @@ -93,7 +93,7 @@ detail, and quite possibily in its own notebook since I think it will help to show some gotchas like you mentioned verbally. 
I am still leaning toward delaying the introduction of this topic....--> -```python +```{.python .input} print('y=', y) print('id(y):', id(y)) y = y + x @@ -104,7 +104,7 @@ print('id(y):', id(y)) We can assign the result to a previously allocated array with slice notation, e.g., `result[:] = ...`. -```python +```{.python .input} print('x=', x) z = nd.zeros_like(x) print('z is zeros_like x, z=', z) @@ -120,7 +120,7 @@ before copying it to z. To make better use of memory, we can perform operations in place, avoiding temporary buffers. To do this we specify the `out` keyword argument every operator supports: -```python +```{.python .input} print('x=', x, 'is in id(x):', id(x)) print('y=', y, 'is in id(y):', id(y)) print('z=', z, 'is in id(z):', id(z)) @@ -136,7 +136,7 @@ itself. There are two ways to do this in MXNet. = x op y 2. By using the op-equals operators like `+=` -```python +```{.python .input} print('x=', x, 'is in id(x):', id(x)) x += y print('x=', x, 'is in id(x):', id(x)) @@ -155,7 +155,7 @@ the whole array: a[:] Here's an example of reading the second and third rows from `x`. -```python +```{.python .input} x = nd.array([1, 2, 3]) print('1D complete array, x=', x) s = x[1:3] @@ -168,7 +168,7 @@ print('slicing the 2nd and 3rd elements, s=', s) Now let's try writing to a specific element. -```python +```{.python .input} print('original x, x=', x) x[2] = 9.0 print('replaced entire row with x[2] = 9.0, x=', x) @@ -180,7 +180,7 @@ print('replaced range of elements with x[1:2,1:3] = 5.0, x=', x) Multi-dimensional slicing is also supported. -```python +```{.python .input} x = nd.array([[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]]) print('original x, x=', x) s = x[1:2,1:3] @@ -214,7 +214,7 @@ a shape like (3,3) you lose some of the impact and miss some errors if people play with the values. Better to have a distinct shape so that it is more obvious what is happening and what can break.--> -```python +```{.python .input} x = nd.ones(shape=(3,6)) print('x = ', x) y = nd.arange(6) @@ -230,7 +230,7 @@ That's because broadcasting prefers to duplicate along the left most axis. We can alter this behavior by explicitly giving `y` a $2$D shape using `.reshape`. You can also chain `.arange` and `.reshape` to do this in one step. -```python +```{.python .input} y = y.reshape((3,1)) print('y = ', y) print('x + y = ', x+y) @@ -242,12 +242,12 @@ print('y = ', y) Converting MXNet NDArrays to and from NumPy is easy. The converted arrays do not share memory. -```python +```{.python .input} a = x.asnumpy() type(a) ``` -```python +```{.python .input} y = nd.array(a) print('id(a)=', id(a), 'id(x)=', id(x), 'id(y)=', id(y)) ``` diff --git a/docs/python_docs/python/tutorials/packages/legacy/ndarray/gotchas_numpy_in_mxnet.md b/docs/python_docs/python/tutorials/packages/legacy/ndarray/gotchas_numpy_in_mxnet.md index 98d79d7510e9..2518f52e78bb 100644 --- a/docs/python_docs/python/tutorials/packages/legacy/ndarray/gotchas_numpy_in_mxnet.md +++ b/docs/python_docs/python/tutorials/packages/legacy/ndarray/gotchas_numpy_in_mxnet.md @@ -55,7 +55,7 @@ If a required operator is missing from `NDArray API`, there are few things you c There are a situation, when you can assemble a higher level operator using existing operators. An example for that is the [np.full_like()](https://docs.scipy.org/doc/numpy-1.14.0/reference/generated/numpy.full_like.html) operator. This operator doesn't exist in `NDArray API`, but can be easily replaced with a combination of existing operators. 
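One possible composition is sketched here (the tutorial's own version is in the next cell):

```{.python .input}
import numpy as np
from mxnet import nd

a = nd.array([[1, 2, 3], [4, 5, 6]])
b = nd.ones_like(a) * 7                          # emulate np.full_like(a, 7)
print(np.array_equal(b.asnumpy(), np.full_like(a.asnumpy(), 7)))
```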
-```python +```{.python .input} from mxnet import nd import numpy as np @@ -80,7 +80,7 @@ Some operators may have slightly different name, but are similar in terms of fun One particular example of different input requirements is [nd.pad()](/api/python/docs/api/ndarray/ndarray.html#mxnet.ndarray.pad). The trick is that it can only work with 4-dimensional tensors. If your input has less dimensions, then you need to expand its number before using `nd.pad()` as it is shown in the code block below: -```python +```{.python .input} def pad_array(data, max_length): # expand dimensions to 4, because nd.pad can work only with 4 dims data_expanded = data.reshape(1, 1, 1, data.shape[0]) @@ -115,7 +115,7 @@ There are cases, when you have to use either `.asnumpy()` or `.asscalar()` metho You can minimize the impact of a blocking call by calling `.asnumpy()` or `.asscalar()` in the moment, when you think the calculation of this value is already done. In the example below, we introduce the `LossBuffer` class. It is used to cache the previous value of a loss function. By doing so, we delay printing by one iteration in hope that the `Execution Engine` would finish the previous iteration and blocking time would be minimized. -```python +```{.python .input} from __future__ import print_function import mxnet as mx diff --git a/docs/python_docs/python/tutorials/packages/legacy/ndarray/sparse/csr.md b/docs/python_docs/python/tutorials/packages/legacy/ndarray/sparse/csr.md index b91279cff4d4..b14bec66d861 100644 --- a/docs/python_docs/python/tutorials/packages/legacy/ndarray/sparse/csr.md +++ b/docs/python_docs/python/tutorials/packages/legacy/ndarray/sparse/csr.md @@ -94,7 +94,7 @@ There are a few different ways to create a `CSRNDArray`, but first let's recreat You can create a CSRNDArray with data, indices and indptr by using the `csr_matrix` function: -```python +```{.python .input} import mxnet as mx # Create a CSRNDArray with python lists shape = (3, 4) @@ -116,7 +116,7 @@ array([[ 7., 0., 8., 0.], -```python +```{.python .input} import numpy as np # Create a CSRNDArray with numpy arrays data_np = np.array([7, 8, 9]) @@ -136,7 +136,7 @@ array([[7, 0, 8, 0], -```python +```{.python .input} # Compare the two. They are exactly the same. {'a':a.asnumpy(), 'b':b.asnumpy()} ``` @@ -155,7 +155,7 @@ array([[7, 0, 8, 0], You can create an MXNet CSRNDArray from a `scipy.sparse.csr.csr_matrix` object by using the `array` function: -```python +```{.python .input} try: import scipy.sparse as spsp # generate a csr matrix in scipy @@ -177,7 +177,7 @@ d:[[7 0 8 0] What if you have a big set of data and you haven't calculated indices or indptr yet? Let's try a simple CSRNDArray from an existing array of data and derive those values with some built-in functions. We can mockup a "big" dataset with a random amount of the data being non-zero, then compress it by using the `tostype` function, which is explained further in the [Storage Type Conversion](#storage-type-conversion) section: -```python +```{.python .input} big_array = mx.nd.round(mx.nd.random.uniform(low=0, high=1, shape=(1000, 100))) print(big_array) big_array_csr = big_array.tostype('csr') @@ -205,7 +205,7 @@ You can also create a CSRNDArray from another using the `array` function specify which accepts a numpy type. By default, `float32` is used. 
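As a quick sanity check of that default, the small sketch below reuses the `data`, `indices` and `indptr` values from the first example in this tutorial; the variable name `default_csr` is chosen purely for illustration.

```{.python .input}
import mxnet as mx

# Build a CSRNDArray without passing an explicit dtype and inspect the result;
# the element type should come back as float32.
default_csr = mx.nd.sparse.csr_matrix(([7, 8, 9], [0, 2, 1], [0, 2, 2, 3]), shape=(3, 4))
print(default_csr.dtype)
```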
-```python +```{.python .input} # Float32 is used by default e = mx.nd.sparse.array(a) # Create a 16-bit float array @@ -233,7 +233,7 @@ As you have seen already, we can inspect the contents of a `CSRNDArray` by filli its contents into a dense `numpy.ndarray` using the `asnumpy` function. -```python +```{.python .input} a.asnumpy() ``` @@ -249,7 +249,7 @@ array([[ 7., 0., 8., 0.], You can also inspect the internal storage of a CSRNDArray by accessing attributes such as `indptr`, `indices` and `data`: -```python +```{.python .input} # Access data array data = a.data # Access indices array @@ -281,7 +281,7 @@ You can also convert storage types with: To convert an NDArray to a CSRNDArray and vice versa by using the ``tostype`` function: -```python +```{.python .input} # Create a dense NDArray ones = mx.nd.ones((2,2)) # Cast the storage type from `default` to `csr` @@ -305,7 +305,7 @@ dense = csr.tostype('default') To convert the storage type by using the `cast_storage` operator: -```python +```{.python .input} # Create a dense NDArray ones = mx.nd.ones((2,2)) # Cast the storage type to `csr` @@ -332,7 +332,7 @@ You can use the `copy` method which makes a deep copy of the array and its data, You can also use the `copyto` method or the slice operator `[]` to deep copy to an existing array. -```python +```{.python .input} a = mx.nd.ones((2,2)).tostype('csr') b = a.copy() c = mx.nd.sparse.zeros('csr', (2,2)) @@ -357,7 +357,7 @@ the storage type of destination array will not change when copying with `copyto` the slice operator `[]`. -```python +```{.python .input} e = mx.nd.sparse.zeros('csr', (2,2)) f = mx.nd.sparse.zeros('csr', (2,2)) g = mx.nd.ones(e.shape) @@ -377,7 +377,7 @@ g.copyto(f) You can slice a CSRNDArray on axis 0 with operator `[]`, which copies the slices and returns a new CSRNDArray. -```python +```{.python .input} a = mx.nd.array(np.arange(6).reshape(3,2)).tostype('csr') b = a[1:2].asnumpy() c = a[:].asnumpy() @@ -403,7 +403,7 @@ Note that multi-dimensional indexing or slicing along a particular axis is curre Operators that have specialized implementation for sparse arrays can be accessed in `mx.nd.sparse`. You can read the [mxnet.ndarray.sparse API documentation](https://mxnet.apache.org/versions/master/api/python/ndarray/sparse.html) to find what sparse operators are available. -```python +```{.python .input} shape = (3, 4) data = [7, 8, 9] indptr = [0, 2, 2, 3] @@ -428,7 +428,7 @@ out = mx.nd.sparse.dot(a, rhs) # invoke sparse dot operator specialized for dot For any sparse operator, the storage type of output array is inferred based on inputs. You can either read the documentation or inspect the `stype` attribute of the output array to know what storage type is inferred: -```python +```{.python .input} b = a * 2 # b will be a CSRNDArray since zero multiplied by 2 is still zero c = a + mx.nd.ones(shape=(3, 4)) # c will be a dense NDArray {'b.stype':b.stype, 'c.stype':c.stype} @@ -448,7 +448,7 @@ If sparse inputs are provided, MXNet will convert sparse inputs into dense ones If sparse outputs are provided, MXNet will convert the dense outputs generated by the dense operator into the provided sparse format. 
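If you prefer to make those conversions explicit in your own code rather than rely on the implicit fallback, one possible pattern, sketched below with a small illustrative matrix and variable names chosen only for this example, is to convert once with `tostype`, run the dense-only computation, and convert the result back.

```{.python .input}
import mxnet as mx

# Make the sparse -> dense -> sparse round trip explicit instead of relying
# on the implicit storage fallback described above.
sparse_in = mx.nd.sparse.csr_matrix(([7., 8., 9.], [0, 2, 1], [0, 2, 2, 3]), shape=(3, 4))
dense_in = sparse_in.tostype('default')   # convert once to the dense format
dense_out = mx.nd.log(dense_in + 1)       # dense-only computation; log(0 + 1) = 0, so zeros stay zero
sparse_out = dense_out.tostype('csr')     # convert the result back to CSR
print(sparse_out.stype)
```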
-```python +```{.python .input} e = mx.nd.sparse.zeros('csr', a.shape) d = mx.nd.log(a) # dense operator with a sparse input e = mx.nd.log(a, out=e) # dense operator with a sparse output @@ -469,7 +469,7 @@ Note that warning messages will be printed when such a storage fallback event ha You can load data in batches from a CSRNDArray using `mx.io.NDArrayIter`: -```python +```{.python .input} # Create the source CSRNDArray data = mx.nd.array(np.arange(36).reshape((9,4))).tostype('csr') labels = np.ones([9, 1]) @@ -492,7 +492,7 @@ dataiter = mx.io.NDArrayIter(data, labels, batch_size, last_batch_handle='discar You can also load data stored in the [libsvm file format](https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/) using `mx.io.LibSVMIter`, where the format is: ``