From a5ec9b9738ebbc30ebe38374a37bae6369946e07 Mon Sep 17 00:00:00 2001 From: Sheng Zha Date: Fri, 14 Aug 2020 12:52:22 -0700 Subject: [PATCH] update notebooks --- .../python/tutorials/deploy/export/onnx.md | 12 +- .../inference/image_classification_jetson.md | 2 +- .../python/tutorials/extend/customop.md | 24 +-- .../gluon_from_experiment_to_deployment.md | 14 +- .../logistic_regression_explained.md | 18 +- .../gluon/blocks/activations/activations.md | 22 +-- .../packages/gluon/blocks/custom-layer.md | 14 +- .../gluon/blocks/custom_layer_beginners.md | 20 +-- .../packages/gluon/blocks/hybridize.md | 12 +- .../tutorials/packages/gluon/blocks/naming.md | 22 +-- .../packages/gluon/blocks/save_load_params.md | 20 +-- .../packages/gluon/data/data_augmentation.md | 28 ++-- .../tutorials/packages/gluon/data/datasets.md | 28 ++-- .../gluon/image/image-augmentation.md | 148 ----------------- .../packages/gluon/image/info_gan.md | 40 ++--- .../tutorials/packages/gluon/image/mnist.md | 24 +-- .../python/tutorials/packages/gluon/index.rst | 12 -- .../packages/gluon/loss/custom-loss.md | 18 +- .../tutorials/packages/gluon/loss/loss.md | 2 +- .../gluon/training/fit_api_tutorial.md | 26 +-- .../learning_rates/learning_rate_finder.md | 16 +- .../learning_rates/learning_rate_schedules.md | 30 ++-- .../learning_rate_schedules_advanced.md | 30 ++-- .../gluon/training/normalization/index.md | 36 ++-- .../tutorials/packages/kvstore/kvstore.md | 16 +- .../legacy/ndarray/01-ndarray-intro.md | 26 +-- .../legacy/ndarray/02-ndarray-operations.md | 32 ++-- .../legacy/ndarray/gotchas_numpy_in_mxnet.md | 6 +- .../packages/legacy/ndarray/sparse/csr.md | 38 ++--- .../legacy/ndarray/sparse/row_sparse.md | 42 ++--- .../legacy/ndarray/sparse/train_gluon.md | 50 +++--- .../tutorials/packages/np/cheat-sheet.md | 154 +++++++++--------- .../packages/onnx/fine_tuning_gluon.md | 62 +++---- .../packages/onnx/inference_on_onnx_model.md | 40 ++--- .../tutorials/packages/optimizer/index.md | 30 ++-- .../tutorials/performance/backend/amp.md | 22 +-- .../tutorials/performance/backend/profiler.md | 22 +-- 37 files changed, 499 insertions(+), 659 deletions(-) delete mode 100644 docs/python_docs/python/tutorials/packages/gluon/image/image-augmentation.md diff --git a/docs/python_docs/python/tutorials/deploy/export/onnx.md b/docs/python_docs/python/tutorials/deploy/export/onnx.md index f44d476312aa..4867bc86a603 100644 --- a/docs/python_docs/python/tutorials/deploy/export/onnx.md +++ b/docs/python_docs/python/tutorials/deploy/export/onnx.md @@ -34,7 +34,7 @@ To run the tutorial you will need to have installed the following python modules *Note:* MXNet-ONNX importer and exporter follows version 7 of ONNX operator set which comes with ONNX v1.2.1. -```python +```{.python .input} import mxnet as mx import numpy as np from mxnet.contrib import onnx as onnx_mxnet @@ -47,7 +47,7 @@ logging.basicConfig(level=logging.INFO) We download the pre-trained ResNet-18 [ImageNet](http://www.image-net.org/) model from the [MXNet Model Zoo](/api/python/docs/api/gluon/model_zoo/index.html). We will also download synset file to match labels. -```python +```{.python .input} # Download pre-trained resnet model - json and params by running following code. path='http://data.mxnet.io/models/imagenet/' [mx.test_utils.download(path+'resnet/18-layers/resnet-18-0000.params'), @@ -61,7 +61,7 @@ Now, we have downloaded ResNet-18 symbol, params and synset file on the disk. Let us describe the MXNet's `export_model` API. 
-```python +```{.python .input} help(onnx_mxnet.export_model) ``` @@ -109,7 +109,7 @@ Since we have downloaded pre-trained model files, we will use the `export_model` We will use the downloaded pre-trained model files (sym, params) and define input variables. -```python +```{.python .input} # Downloaded input symbol and params files sym = './resnet-18-symbol.json' params = './resnet-18-0000.params' @@ -123,7 +123,7 @@ onnx_file = './mxnet_exported_resnet50.onnx' We have defined the input parameters required for the `export_model` API. Now, we are ready to covert the MXNet model into ONNX format. -```python +```{.python .input} # Invoke export model API. It returns path of the converted onnx model converted_model_path = onnx_mxnet.export_model(sym, params, [input_shape], np.float32, onnx_file) ``` @@ -134,7 +134,7 @@ This API returns path of the converted model which you can later use to import t Now we can check validity of the converted ONNX model by using ONNX checker tool. The tool will validate the model by checking if the content contains valid protobuf: -```python +```{.python .input} from onnx import checker import onnx diff --git a/docs/python_docs/python/tutorials/deploy/inference/image_classification_jetson.md b/docs/python_docs/python/tutorials/deploy/inference/image_classification_jetson.md index 5a697ca7960e..0a7a8d5d5bd2 100644 --- a/docs/python_docs/python/tutorials/deploy/inference/image_classification_jetson.md +++ b/docs/python_docs/python/tutorials/deploy/inference/image_classification_jetson.md @@ -73,7 +73,7 @@ And we are done. You can test the installation now by importing mxnet from pytho We are now ready to run a pre-trained model and run inference on a Jetson module. In this tutorial we are using ResNet-50 model trained on Imagenet dataset. We run the following classification script with either cpu/gpu context using python3. -```python +```{.python .input} from mxnet import gluon import mxnet as mx diff --git a/docs/python_docs/python/tutorials/extend/customop.md b/docs/python_docs/python/tutorials/extend/customop.md index f1ee1d2ae601..d7c08f4751eb 100644 --- a/docs/python_docs/python/tutorials/extend/customop.md +++ b/docs/python_docs/python/tutorials/extend/customop.md @@ -26,7 +26,7 @@ Custom operator in python is easy to develop and good for prototyping, but may h -```python +```{.python .input} import numpy as np import mxnet as mx from mxnet import gluon, autograd @@ -42,7 +42,7 @@ This operator implements the standard sigmoid activation function. This is only First we implement the forward and backward computation by sub-classing `mx.operator.CustomOp`: -```python +```{.python .input} class Sigmoid(mx.operator.CustomOp): def forward(self, is_train, req, in_data, out_data, aux): """Implements forward computation. @@ -75,7 +75,7 @@ class Sigmoid(mx.operator.CustomOp): Then we need to register the custom op and describe it's properties like input and output shapes so that mxnet can recognize it. 
This is done by sub-classing `mx.operator.CustomOpProp`: -```python +```{.python .input} @mx.operator.register("sigmoid") # register with name "sigmoid" class SigmoidProp(mx.operator.CustomOpProp): def __init__(self): @@ -110,7 +110,7 @@ class SigmoidProp(mx.operator.CustomOpProp): We can now use this operator by calling `mx.nd.Custom`: -```python +```{.python .input} x = mx.nd.array([0, 1, 2, 3]) # attach gradient buffer to x for autograd x.attach_grad() @@ -121,7 +121,7 @@ with autograd.record(): print(y) ``` -```python +```{.python .input} # call backward computation y.backward() # gradient is now saved to the grad buffer we attached previously @@ -137,7 +137,7 @@ The dense operator performs a dot product between data and weight, then add bias ### Forward & backward implementation -```python +```{.python .input} class Dense(mx.operator.CustomOp): def __init__(self, bias): self._bias = bias @@ -158,7 +158,7 @@ class Dense(mx.operator.CustomOp): ### Registration -```python +```{.python .input} @mx.operator.register("dense") # register with name "sigmoid" class DenseProp(mx.operator.CustomOpProp): def __init__(self, bias): @@ -192,7 +192,7 @@ class DenseProp(mx.operator.CustomOpProp): Parameterized CustomOp are usually used together with Blocks, which holds the parameter. -```python +```{.python .input} class DenseBlock(mx.gluon.Block): def __init__(self, in_channels, channels, bias, **kwargs): super(DenseBlock, self).__init__(**kwargs) @@ -207,7 +207,7 @@ class DenseBlock(mx.gluon.Block): ### Example usage -```python +```{.python .input} dense = DenseBlock(3, 5, 0.1) dense.initialize() x = mx.nd.uniform(shape=(4, 3)) @@ -218,7 +218,7 @@ print(y) ## Using custom operators with fork In Linux systems, the default method in multiprocessing to create process is by using fork. If there are unfinished async custom operations when forking, the program will be blocked because of python GIL. Always use sync calls like `wait_to_read` or `waitall` before calling fork. -```python +```{.python .input} x = mx.nd.array([0, 1, 2, 3]) y = mx.nd.Custom(x, op_type='sigmoid') # unfinished async sigmoid operation will cause blocking @@ -227,10 +227,10 @@ os.fork() Correctly handling this will make mxnet depend upon libpython, so the workaround now is to ensure that all custom operations are executed before forking process. -```python +```{.python .input} x = mx.nd.array([0, 1, 2, 3]) y = mx.nd.Custom(x, op_type='sigmoid') # force execution by reading y print(y.asnumpy()) os.fork() -``` \ No newline at end of file +``` diff --git a/docs/python_docs/python/tutorials/getting-started/gluon_from_experiment_to_deployment.md b/docs/python_docs/python/tutorials/getting-started/gluon_from_experiment_to_deployment.md index bd9dbacf3e97..7f34708c0f4c 100644 --- a/docs/python_docs/python/tutorials/getting-started/gluon_from_experiment_to_deployment.md +++ b/docs/python_docs/python/tutorials/getting-started/gluon_from_experiment_to_deployment.md @@ -44,7 +44,7 @@ We will use the [Oxford 102 Category Flower Dataset](http://www.robots.ox.ac.uk/ We have prepared a utility file to help you download and organize your data into train, test, and validation sets. 
Run the following Python code to download and prepare the data: -```python +```{.python .input} import mxnet as mx data_util_file = "oxford_102_flower_dataset.py" base_url = "https://raw.githubusercontent.com/apache/incubator-mxnet/master/docs/tutorial_utils/data/{}?raw=true" @@ -65,7 +65,7 @@ Now your data will be organized into train, test, and validation sets, images be Now let's first import necessary packages: -```python +```{.python .input} import math import os import time @@ -80,7 +80,7 @@ from mxnet.gluon.model_zoo.vision import resnet50_v2 Next, we define the hyper-parameters that we will use for fine-tuning. We will use the [MXNet learning rate scheduler](/api/python/docs/tutorials/packages/gluon/training/learning_rates/learning_rate_schedules.html) to adjust learning rates during training. Here we set the `epochs` to 1 for quick demonstration, please change to 40 for actual training. -```python +```{.python .input} classes = 102 epochs = 1 lr = 0.001 @@ -108,7 +108,7 @@ Now we will apply data augmentations on training images. This makes minor altera For validation and inference, we only need to apply step 1, 4, and 5. We also need to save the mean and standard deviation values for [inference using C++](/api/cpp/docs/tutorials/cpp_inference). -```python +```{.python .input} jitter_param = 0.4 lighting_param = 0.1 @@ -165,7 +165,7 @@ Before we go to training, one unique Gluon feature you should be aware of is hyb -```python +```{.python .input} # load pre-trained resnet50_v2 from model zoo finetune_net = resnet50_v2(pretrained=True, ctx=ctx) @@ -195,7 +195,7 @@ Now let's define the test metrics and start fine-tuning. -```python +```{.python .input} def test(net, val_data, ctx): metric = mx.metric.Accuracy() for i, (data, label) in enumerate(val_data): @@ -254,7 +254,7 @@ We now have a trained our custom model. This can be serialized into model files -```python +```{.python .input} finetune_net.export("flower-recognition", epoch=epochs) ``` diff --git a/docs/python_docs/python/tutorials/getting-started/logistic_regression_explained.md b/docs/python_docs/python/tutorials/getting-started/logistic_regression_explained.md index 277aa5d2d82c..e36e048f371a 100644 --- a/docs/python_docs/python/tutorials/getting-started/logistic_regression_explained.md +++ b/docs/python_docs/python/tutorials/getting-started/logistic_regression_explained.md @@ -23,7 +23,7 @@ Logistic Regression is one of the first models newcomers to Deep Learning are im Before anything else, let's import required packages for this tutorial. -```python +```{.python .input} import numpy as np import mxnet as mx from mxnet import nd, autograd, gluon @@ -36,7 +36,7 @@ mx.random.seed(12345) # Added for reproducibility In this tutorial we will use fake dataset, which contains 10 features drawn from a normal distribution with mean equals to 0 and standard deviation equals to 1, and a class label, which can be either 0 or 1. The size of the dataset is an arbitrary value. The function below helps us to generate a dataset. Class label `y` is generated via a non-random logic, so the network would have a pattern to look for. Boundary of 3 is selected to make sure that number of positive examples smaller than negative, but not too small -```python +```{.python .input} def get_random_data(size, ctx): x = nd.normal(0, 1, shape=(size, 10), ctx=ctx) y = x.sum(axis=1) > 3 @@ -46,7 +46,7 @@ def get_random_data(size, ctx): Also, let's define a set of hyperparameters, that we are going to use later. 
Since our model is simple and dataset is small, we are going to use CPU for calculations. Feel free to change it to GPU for a more advanced scenario. -```python +```{.python .input} ctx = mx.cpu() train_data_size = 1000 val_data_size = 100 @@ -60,7 +60,7 @@ To work with data, Apache MXNet provides [Dataset](https://mxnet.apache.org/api/ Below we define training and validation datasets, which we are going to use in the tutorial. -```python +```{.python .input} train_x, train_ground_truth_class = get_random_data(train_data_size, ctx) train_dataset = ArrayDataset(train_x, train_ground_truth_class) train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True) @@ -77,7 +77,7 @@ The only requirement for the logistic regression is that the last layer of the n Below, we define a model which has an input layer of 10 neurons, a couple of inner layers of 10 neurons each, and output layer of 1 neuron. We stack the layers using [HybridSequential](https://mxnet.apache.org/api/python/gluon/gluon.html#mxnet.gluon.nn.HybridSequential) block and initialize parameters of the network using [Xavier](https://mxnet.apache.org/api/python/optimization/optimization.html#mxnet.initializer.Xavier) initialization. -```python +```{.python .input} net = nn.HybridSequential() net.add(nn.Dense(units=10, activation='relu')) # input layer @@ -99,7 +99,7 @@ Metric helps us to estimate how good our model is in terms of a problem we are t Below we define these objects. -```python +```{.python .input} loss = gluon.loss.SigmoidBinaryCrossEntropyLoss() trainer = Trainer(params=net.collect_params(), optimizer='sgd', optimizer_params={'learning_rate': 0.1}) @@ -110,7 +110,7 @@ f1 = mx.metric.F1() The next step is to define the training function in which we iterate over all batches of training data, execute the forward pass on each batch and calculate training loss. On line 19, we sum losses of every batch per epoch into a single variable, because we calculate loss per single batch, but want to display it per epoch. -```python +```{.python .input} def train_model(): cumulative_train_loss = 0 @@ -159,7 +159,7 @@ For `F1` metric to work, instead of one number per class, we must pass probabili Then we pass this stacked matrix to `F1` score. -```python +```{.python .input} def validate_model(threshold): cumulative_val_loss = 0 @@ -193,7 +193,7 @@ def validate_model(threshold): By using the defined above functions, we can finally write our main training loop. -```python +```{.python .input} epochs = 10 threshold = 0.5 diff --git a/docs/python_docs/python/tutorials/packages/gluon/blocks/activations/activations.md b/docs/python_docs/python/tutorials/packages/gluon/blocks/activations/activations.md index 755253708b43..e5ba40353a9f 100644 --- a/docs/python_docs/python/tutorials/packages/gluon/blocks/activations/activations.md +++ b/docs/python_docs/python/tutorials/packages/gluon/blocks/activations/activations.md @@ -25,7 +25,7 @@ If you are looking to answer the question, 'which activation function should I u In order to compare the various activation functions and to understand the nuances of their differences we have a snippet of code to plot the activation functions (used in the forward pass) and their gradients (used in the backward pass). 
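
A gradient curve like the ones plotted below can be obtained with `autograd`: record the forward pass, call `backward()`, and read the gradient off the input. Here is a minimal sketch of that core idea (the plotting helper used below presumably builds on the same mechanism, adding the matplotlib code around it):

```{.python .input}
# Minimal sketch: forward values and gradient of an activation via autograd
import mxnet as mx
from mxnet import autograd

x = mx.nd.arange(-5, 5, 0.1)
x.attach_grad()                       # allocate a gradient buffer for x
act = mx.gluon.nn.Activation('relu')  # parameter-free block, no initialize() needed
with autograd.record():
    y = act(x)
y.backward()                          # x.grad now holds d(act)/dx at each point
print(y[48:52], x.grad[48:52])        # values around x = 0
```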
-```python +```{.python .input} import numpy as np import mxnet as mx from matplotlib import pyplot as plt @@ -62,7 +62,7 @@ $$ \sigma(x) = \dfrac{e^x}{e^x + 1} $$ Warning: the term sigmoid is overloaded and can be used to refer to the class of 's' shaped functions or particularly to the logistic function that we've just described. In MxNet the sigmoid activation specifically refers to logistic function sigmoid. -```python +```{.python .input} visualize_activation(mx.gluon.nn.Activation('sigmoid')) ``` @@ -90,7 +90,7 @@ which shows its direct relation to sigmoid by the following equation: $$ tanh(x) = 2\sigma(2x) - 1$$ -```python +```{.python .input} visualize_activation(mx.gluon.nn.Activation('tanh')) ``` @@ -107,7 +107,7 @@ The SoftSign activation is an alternative to tanh that is also centered at zero $$ softsign(x) = \dfrac{x}{abs(x) + 1} $$ -```python +```{.python .input} visualize_activation(mx.gluon.nn.Activation('softsign')) ``` @@ -129,7 +129,7 @@ ReLU was introduced to neural networks in the [paper by Hahnloser et al](https:/ ReLU is the most widely used activation due to its simplicity and performance across multiple datasets and although there have been efforts to introduce activation functions, many of them described in this tutorial, that improve on ReLU, they have not gained as much widespread adoption. -```python +```{.python .input} visualize_activation(mx.gluon.nn.Activation('relu')) ``` @@ -148,7 +148,7 @@ $$ SoftReLU(x) = log(1 + e^x)$$ The SoftReLU can be seen as a smooth version of the ReLU by observing that its derivative is the sigmoid, seen below, which is a smooth version of the gradient of the ReLU shown above. -```python +```{.python .input} visualize_activation(mx.gluon.nn.Activation('softrelu')) ``` @@ -170,7 +170,7 @@ where $\alpha > 0$ is small positive number. In MXNet, by default the $\alpha$ p Here is a visualization for the LeakyReLU with $\alpha = 0.05$ -```python +```{.python .input} visualize_activation(mx.gluon.nn.LeakyReLU(0.05)) ``` @@ -184,7 +184,7 @@ As shown in the graph, the LeakyReLU's gradient is non-zero everywhere, in an at The PReLU activation function, or Parametric Leaky ReLU introduced by [He et al](https://arxiv.org/pdf/1502.01852.pdf), is a version of LeakyReLU that learns the parameter $\alpha$ during training. An initialization parameter is passed into the PreLU activation layer and this is treated as a learnable parameter that is updated via gradient descent during training. This is in contrast to LeakyReLU where $\alpha$ is a hyperparameter. -```python +```{.python .input} prelu = mx.gluon.nn.PReLU(mx.init.Normal(0.05)) prelu.initialize() visualize_activation(prelu) @@ -208,7 +208,7 @@ $$ ELU(\alpha, x) = \begin{cases} \end{cases}$$ -```python +```{.python .input} visualize_activation(mx.gluon.nn.ELU()) ``` @@ -229,7 +229,7 @@ $$ SELU(\alpha, x) = \lambda \cdot\begin{cases} In SELU, unlike ELU, the parameters $\alpha$ and $\lambda$ are fixed parameters calculated from the data. For standard scaled inputs, these values are $$\alpha=1.6732, \lambda=1.0507$$ as calculated in the paper. -```python +```{.python .input} visualize_activation(mx.gluon.nn.SELU()) ``` @@ -247,7 +247,7 @@ $$ swish(x) = x\cdot\sigma(\beta x)$$ where $\sigma$ is the sigmoid activation function $\sigma(x) = \frac{1}{1 + e^{-x}}$ described above and $\beta$ is a hyperparameter set to 1 by default in MXNet. 
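
Before visualizing it below, here is a quick numerical check of this definition (a minimal sketch, assuming the default $\beta = 1$):

```{.python .input}
# Compare the built-in Swish block against the formula x * sigma(beta * x), with beta = 1
x = mx.nd.array([-2.0, 0.0, 2.0])
swish = mx.gluon.nn.Swish()     # parameter-free, so it can be called without initialize()
manual = x * mx.nd.sigmoid(x)
print(swish(x))
print(manual)
```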
-```python +```{.python .input} visualize_activation(mx.gluon.nn.Swish()) ``` diff --git a/docs/python_docs/python/tutorials/packages/gluon/blocks/custom-layer.md b/docs/python_docs/python/tutorials/packages/gluon/blocks/custom-layer.md index 8a6a2cb6c21a..ff62a55d2617 100644 --- a/docs/python_docs/python/tutorials/packages/gluon/blocks/custom-layer.md +++ b/docs/python_docs/python/tutorials/packages/gluon/blocks/custom-layer.md @@ -31,7 +31,7 @@ The only instance method needed to be implemented is [forward(self, x)](https:// In the example below, we define a new layer and implement `forward()` method to normalize input data by fitting it into a range of [0, 1]. -```python +```{.python .input} # Do some initial imports used throughout this tutorial from __future__ import print_function import mxnet as mx @@ -41,7 +41,7 @@ mx.random.seed(1) # Set seed for reproducable results ``` -```python +```{.python .input} class NormalizationLayer(gluon.Block): def __init__(self): super(NormalizationLayer, self).__init__() @@ -69,7 +69,7 @@ To support hybridization, it is important to use only methods avaible directly f Knowing this, we can can rewrite our example layer, using HybridBlock: -```python +```{.python .input} class NormalizationHybridLayer(gluon.HybridBlock): def __init__(self): super(NormalizationHybridLayer, self).__init__() @@ -81,7 +81,7 @@ class NormalizationHybridLayer(gluon.HybridBlock): Thanks to inheriting from HybridBlock, one can easily do forward pass on a given ndarray, either on CPU or GPU: -```python +```{.python .input} layer = NormalizationHybridLayer() layer(nd.array([1, 2, 3], ctx=mx.cpu())) ``` @@ -109,7 +109,7 @@ Depending on which class you used as a base one, you can use either [Sequential] Below is an example of how to create a simple neural network with a custom layer. In this example, `NormalizationHybridLayer` gets as an input the output from `Dense(5)` layer and pass its output as an input to `Dense(1)` layer. -```python +```{.python .input} net = gluon.nn.HybridSequential() # Define a Neural Network as a sequence of hybrid blocks net.add(Dense(5)) # Add Dense layer with 5 neurons net.add(NormalizationHybridLayer()) # Add our custom layer @@ -142,7 +142,7 @@ Usually, a layer has a set of associated parameters, sometimes also referred as All parameters of a block are stored and accessed via [ParameterDict](https://github.com/apache/incubator-mxnet/blob/master/python/mxnet/gluon/parameter.py#L508) class. This class helps with initialization, updating, saving and loading of the parameters. Each layer can have multiple set of parameters, and all of them can be stored in a single instance of the `ParameterDict` class. On a block level, the instance of the `ParameterDict` class is accessible via `self.params` field, and outside of a block one can access all parameters of the network via [collect_params()](https://mxnet.apache.org/api/python/gluon/gluon.html#mxnet.gluon.Block.collect_params) method called on a `container`. `ParameterDict` uses [Parameter](https://mxnet.apache.org/api/python/gluon/gluon.html#mxnet.gluon.Parameter) class to represent parameters inside of Apache MxNet neural network. If parameter doesn't exist, trying to get a parameter via `self.params` will create it automatically. 
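
As a point of reference before defining a custom parameterized layer below, here is a minimal sketch of inspecting the parameters of a built-in layer (the layer and its shapes are arbitrary):

```{.python .input}
# Sketch: inspecting the ParameterDict of a built-in layer
dense = mx.gluon.nn.Dense(3, in_units=2)
dense.initialize()
print(dense.collect_params())   # ParameterDict holding the weight and bias Parameters
print(dense.weight.data())      # a Parameter exposes its values via .data()
```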
-```python +```{.python .input} class NormalizationHybridLayer(gluon.HybridBlock): def __init__(self, hidden_units, scales): super(NormalizationHybridLayer, self).__init__() @@ -179,7 +179,7 @@ The last peculiarity is due to support of imperative and symbolic programming by Running forward pass on this network is very similar to the previous example, so instead of just doing one forward pass, let's run whole training for a few epochs to show that `scales` parameter doesn't change during the training while `weights` parameter is changing. -```python +```{.python .input} def print_params(title, net): """ Helper function to print out the state of parameters of NormalizationHybridLayer diff --git a/docs/python_docs/python/tutorials/packages/gluon/blocks/custom_layer_beginners.md b/docs/python_docs/python/tutorials/packages/gluon/blocks/custom_layer_beginners.md index 99fed59678ca..005ecd510a56 100644 --- a/docs/python_docs/python/tutorials/packages/gluon/blocks/custom_layer_beginners.md +++ b/docs/python_docs/python/tutorials/packages/gluon/blocks/custom_layer_beginners.md @@ -29,7 +29,7 @@ The only instance method needed to be implemented is [forward(self, x)](https:// In the example below, we define a new layer and implement `forward()` method to normalize input data by fitting it into a range of [0, 1]. -```python +```{.python .input} # Do some initial imports used throughout this tutorial from __future__ import print_function import mxnet as mx @@ -38,7 +38,7 @@ from mxnet.gluon.nn import Dense mx.random.seed(1) # Set seed for reproducable results ``` -```python +```{.python .input} class NormalizationLayer(gluon.Block): def __init__(self): super(NormalizationLayer, self).__init__() @@ -65,7 +65,7 @@ To support hybridization, it is important to use only methods available directly Knowing this, we can can rewrite our example layer, using HybridBlock: -```python +```{.python .input} class NormalizationHybridLayer(gluon.HybridBlock): def __init__(self): super(NormalizationHybridLayer, self).__init__() @@ -76,12 +76,12 @@ class NormalizationHybridLayer(gluon.HybridBlock): Thanks to inheriting from HybridBlock, one can easily do forward pass on a given ndarray, either on CPU or GPU: -```python +```{.python .input} layer = NormalizationHybridLayer() layer(nd.array([1, 2, 3], ctx=mx.cpu())) ``` -```python +```{.python .input} [0. 0.5 1. ] ``` @@ -100,7 +100,7 @@ Depending on which class you used as a base one, you can use either [Sequential] Below is an example of how to create a simple neural network with a custom layer. In this example, `NormalizationHybridLayer` gets as an input the output from `Dense(5)` layer and pass its output as an input to `Dense(1)` layer. -```python +```{.python .input} net = gluon.nn.HybridSequential() # Define a Neural Network as a sequence of hybrid blocks net.add(Dense(5)) # Add Dense layer with 5 neurons net.add(NormalizationHybridLayer()) # Add our custom layer @@ -113,7 +113,7 @@ input = nd.random_uniform(low=-10, high=10, shape=(5, 2)) # Create 5 random exam net(input) ``` -```python +```{.python .input} [[-0.13601446] [ 0.26103732] [-0.05046433] @@ -128,7 +128,7 @@ Usually, a layer has a set of associated parameters, sometimes also referred as All parameters of a block are stored and accessed via [ParameterDict](https://github.com/apache/incubator-mxnet/blob/master/python/mxnet/gluon/parameter.py#L508) class. This class helps with initialization, updating, saving and loading of the parameters. 
Each layer can have multiple set of parameters, and all of them can be stored in a single instance of the `ParameterDict` class. On a block level, the instance of the `ParameterDict` class is accessible via `self.params` field, and outside of a block one can access all parameters of the network via [collect_params()](https://mxnet.apache.org/api/python/gluon/gluon.html#mxnet.gluon.Block.collect_params) method called on a `container`. `ParamterDict` uses [Parameter](https://mxnet.apache.org/api/python/gluon/gluon.html#mxnet.gluon.Parameter) class to represent parameters inside of Apache MxNet neural network. If parameter doesn’t exist, trying to get a parameter via `self.params` will create it automatically. -```python +```{.python .input} class NormalizationHybridLayer(gluon.HybridBlock): def __init__(self, hidden_units, scales): super(NormalizationHybridLayer, self).__init__() @@ -165,7 +165,7 @@ The last peculiarity is due to support of imperative and symbolic programming by Running forward pass on this network is very similar to the previous example, so instead of just doing one forward pass, let’s run whole training for a few epochs to show that `scales` parameter doesn’t change during the training while `weights` parameter is changing. -```python +```{.python .input} def print_params(title, net): """ Helper function to print out the state of parameters of NormalizationHybridLayer @@ -206,7 +206,7 @@ trainer.step(input.shape[0]) # Trainer updates print_params("=========== Parameters after backward pass ===========\n", net) ``` -```python +```{.python .input} =========== Parameters after forward pass =========== hybridsequential94_normalizationhybridlayer0_weights = diff --git a/docs/python_docs/python/tutorials/packages/gluon/blocks/hybridize.md b/docs/python_docs/python/tutorials/packages/gluon/blocks/hybridize.md index 5f28699dd860..a0d18e3ae7aa 100644 --- a/docs/python_docs/python/tutorials/packages/gluon/blocks/hybridize.md +++ b/docs/python_docs/python/tutorials/packages/gluon/blocks/hybridize.md @@ -225,7 +225,7 @@ The difference between a purely imperative `Block` and hybridizable `HybridBlock When trying to access specific elements in a tensor like this: -```python +```{.python .input} def hybrid_forward(self, F, x): return x[0,0] ``` @@ -240,7 +240,7 @@ There are however several operators that can help you with array manipulations l Sometimes one can be tempted to use conditional logic on the type of the input tensors however the following block: -```python +```{.python .input} def hybrid_forward(self, F, x): if x.dtype =='float16': return x @@ -255,7 +255,7 @@ You cannot use the `dtype` of the symbol at runtime. Symbols only describe opera Similarly you cannot use the compute context of symbol for the same reason that symbols only describe the operations on the data and not the data (or context). You cannot do this: -```python +```{.python .input} def hybrid_forward(self, F, x): if x.context == mx.cpu(): return x @@ -270,7 +270,7 @@ Accessing the current compute context is not possible with symbols. Consider pas Accessing shape information of tensors is very often used for example when trying to flatten a tensor and then reshape it back to its original shape. 
-```python +```{.python .input} def hybrid_forward(self, F, x): return x*x.shape[0] ``` @@ -286,7 +286,7 @@ There are also a lot of operators that support special indices to help with most Last but not least, you cannot directly assign values in tensor in a symbolic graph, the resulting tensors always needs to be the results of operations performed on the inputs of the computational graph. The following code: -```python +```{.python .input} def hybrid_forward(self, F, x): x[0] = 2 return x @@ -298,7 +298,7 @@ Direct item assignment is not possible in symbolic graph since it needs to be pa e.g to set the first element to 2 you can do: -```python +```{.python .input} x = mx.nd.array([1,2,3]) value = mx.nd.ones_like(x)*2 condition = mx.nd.array([0,1,1]) diff --git a/docs/python_docs/python/tutorials/packages/gluon/blocks/naming.md b/docs/python_docs/python/tutorials/packages/gluon/blocks/naming.md index 6f98a2f6b2ce..511bd9b2a4b5 100644 --- a/docs/python_docs/python/tutorials/packages/gluon/blocks/naming.md +++ b/docs/python_docs/python/tutorials/packages/gluon/blocks/naming.md @@ -22,7 +22,7 @@ In gluon, each Parameter or Block has a name. Parameter names and Block names ca In this tutorial we talk about the best practices on naming. First, let's import MXNet and Gluon: -```python +```{.python .input} from __future__ import print_function import mxnet as mx from mxnet import gluon @@ -33,7 +33,7 @@ from mxnet import gluon When creating a block, you can simply do as follows: -```python +```{.python .input} mydense = gluon.nn.Dense(100) print(mydense.name) ``` @@ -41,7 +41,7 @@ print(mydense.name) When you create more Blocks of the same kind, they will be named with incrementing suffixes to avoid collision: -```python +```{.python .input} dense1 = gluon.nn.Dense(100) print(dense1.name) ``` @@ -51,7 +51,7 @@ print(dense1.name) Parameters will be named automatically by a unique name in the format of `param_{uuid4}_{name}`: -```python +```{.python .input} param = gluon.Parameter(name = 'bias') print(param.name) ``` @@ -61,7 +61,7 @@ print(param.name) When getting parameters within a Block, you should use the structure based name as the key: -```python +```{.python .input} print(dense0.collect_params()) ``` @@ -70,7 +70,7 @@ print(dense0.collect_params()) In MXNet 2, we don't have to define children blocks within a `name_scope` any more. Let's demonstrate this by defining and initiating a simple neural net: -```python +```{.python .input} class Model(gluon.HybridBlock): def __init__(self): super(Model, self).__init__() @@ -92,7 +92,7 @@ model0(mx.nd.zeros((1, 20))) The same principle also applies to container blocks like Sequential. We can simply do as follows: -```python +```{.python .input} net = gluon.nn.Sequential() net.add(gluon.nn.Dense(20)) net.add(gluon.nn.Dense(20)) @@ -105,7 +105,7 @@ net.add(gluon.nn.Dense(20)) For `HybridBlock`, we use `save_parameters`/`load_parameters`, which uses model structure, instead of parameter name, to match parameters. -```python +```{.python .input} model0.save_parameters('model.params') model1.load_parameters('model.params') print(mx.nd.load('model.params').keys()) @@ -113,7 +113,7 @@ print(mx.nd.load('model.params').keys()) For `SymbolBlock.imports`, we use `export`, which uses parameter name `param.name`, to save parameters. -```python +```{.python .input} model0.export('model0') model2 = gluon.SymbolBlock.imports('model0-symbol.json', ['data'], 'model0-0000.params') ``` @@ -130,7 +130,7 @@ To see how to do this, we first load a pretrained AlexNet. 
- Note that the output layer is a dense block with 1000 dimension outputs. -```python +```{.python .input} alexnet = gluon.model_zoo.vision.alexnet(pretrained=True) print(alexnet.output) ``` @@ -139,7 +139,7 @@ print(alexnet.output) To change the output to 100 dimension, we replace it with a new block. -```python +```{.python .input} alexnet.output = gluon.nn.Dense(100) alexnet.output.initialize() ``` diff --git a/docs/python_docs/python/tutorials/packages/gluon/blocks/save_load_params.md b/docs/python_docs/python/tutorials/packages/gluon/blocks/save_load_params.md index 38f3b5dae159..631a3151be2d 100644 --- a/docs/python_docs/python/tutorials/packages/gluon/blocks/save_load_params.md +++ b/docs/python_docs/python/tutorials/packages/gluon/blocks/save_load_params.md @@ -31,7 +31,7 @@ The Model architecture of `Hybrid` models stays static and don't change during e Let's look at the above methods in more detail. Let's start by importing the modules we'll need. -```python +```{.python .input} from __future__ import print_function import mxnet as mx @@ -48,7 +48,7 @@ We need a trained model before we can save it to a file. So let's go ahead and b Let's define a helper function to build a LeNet model and another helper to train LeNet with MNIST. -```python +```{.python .input} # Use GPU if one exists, else use CPU ctx = mx.gpu() if mx.context.num_gpus() else mx.cpu() @@ -115,7 +115,7 @@ def train_model(model): Let's build a model and train it. After training, we will save and restore this model from a file. -```python +```{.python .input} net = build_lenet(gluon.nn.Sequential()) train_model(net) ``` @@ -144,7 +144,7 @@ Epoch: 0; Batch 900; Loss 0.008402 Okay, we now have a model (`net`) that we can save to a file. Let's save the parameters of this model to a file using the `save_parameters` function. -```python +```{.python .input} file_name = "net.params" net.save_parameters(file_name) ``` @@ -155,7 +155,7 @@ We have successfully saved the parameters of the model into a file. Let's now create a network with the parameters we saved into the file. We build the network again using the helper first and then load the weights from the file we saved using the `load_parameters` function. -```python +```{.python .input} new_net = build_lenet(gluon.nn.Sequential()) new_net.load_parameters(file_name, ctx=ctx) ``` @@ -166,7 +166,7 @@ If our network is [Hybrid](https://mxnet.apache.org/tutorials/gluon/hybrid.html) Let's test the model we just loaded from file. -```python +```{.python .input} import matplotlib.pyplot as plt def verify_loaded_model(net): @@ -209,7 +209,7 @@ Model predictions: [1. 1. 4. 5. 0. 5. 7. 0. 3. 6.] Note that the network we created above is not a Hybrid network and therefore cannot be serialized into a JSON file. So, let's create a Hybrid version of the same network and train it. -```python +```{.python .input} net = build_lenet(gluon.nn.HybridSequential()) net.hybridize() train_model(net) @@ -238,7 +238,7 @@ Epoch: 0; Batch 900; Loss 0.037809 We now have a trained hybrid network. This can be exported into files using the `export` function. The `export` function will export the model architecture into a `.json` file and model parameters into a `.params` file. -```python +```{.python .input} net.export("lenet", epoch=1) ``` @@ -256,7 +256,7 @@ One of the main reasons to serialize model architecture into a JSON file is to l Serialized Hybrid networks (saved as .JSON and .params file) can be loaded and used inside Python frontend using `gluon.nn.SymbolBlock`. 
To demonstrate that, let's load the network we serialized above. -```python +```{.python .input} import warnings with warnings.catch_warnings(): warnings.simplefilter("ignore") @@ -265,7 +265,7 @@ with warnings.catch_warnings(): `deserialized_net` now contains the network we deserialized from files. Let's test the deserialized network to make sure it works. -```python +```{.python .input} verify_loaded_model(deserialized_net) ``` diff --git a/docs/python_docs/python/tutorials/packages/gluon/data/data_augmentation.md b/docs/python_docs/python/tutorials/packages/gluon/data/data_augmentation.md index 0e320fc2890e..3b4c26a637ef 100644 --- a/docs/python_docs/python/tutorials/packages/gluon/data/data_augmentation.md +++ b/docs/python_docs/python/tutorials/packages/gluon/data/data_augmentation.md @@ -30,7 +30,7 @@ You should be familiar with the concept of a transform and how to apply it to a You can find them in the `mxnet.gluon.data.vision.transforms` module, alongside the deterministic transforms we've seen previously, such as `ToTensor`, `Normalize`, `CenterCrop` and `Resize`. Augmentations involve an element of randomness and all the augmentation transforms are prefixed with `Random`, such as `RandomResizedCrop` and `RandomBrightness`. We'll start by importing MXNet and the `transforms`. -```python +```{.python .input} import matplotlib.pyplot as plt import mxnet as mx from mxnet.gluon.data.vision import transforms @@ -41,7 +41,7 @@ from mxnet.gluon.data.vision import transforms So that we can see the effects of all the vision augmentations, we'll take a sample image of a giraffe and apply various augmentations to it. We can see what it looks like to begin with. -```python +```{.python .input} image_url = 'https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/doc/tutorials/data_aug/inputs/0.jpg' mx.test_utils.download(image_url, "giraffe.jpg") example_image = mx.image.imread("giraffe.jpg") @@ -54,7 +54,7 @@ plt.imshow(example_image.asnumpy()) Since these augmentations are random, we'll apply the same augmentation a few times and plot all of the outputs. We define a few utility functions to help with this. -```python +```{.python .input} def show_images(imgs, num_rows, num_cols, scale=2): # show augmented images in a grid layout aspect_ratio = imgs[0].shape[0]/imgs[0].shape[1] @@ -90,7 +90,7 @@ As an example, we randomly (using a uniform distribution) crop a region of the i And then we resize this cropped region to 200 by 200 pixels. -```python +```{.python .input} shape_aug = transforms.RandomResizedCrop(size=(200, 200), scale=(0.1, 1), ratio=(0.5, 2)) @@ -105,7 +105,7 @@ apply(example_image, shape_aug) A simple augmentation technique is flipping. Usually flipping horizontally doesn't change the category of object and results in an image that's still plausible in the real world. Using `RandomFlipLeftRight`, we randomly flip the image horizontally 50% of the time. -```python +```{.python .input} apply(example_image, transforms.RandomFlipLeftRight()) ``` @@ -117,7 +117,7 @@ apply(example_image, transforms.RandomFlipLeftRight()) Although it's not as common as flipping left and right, you can flip the image vertically 50% of the time with `RandomFlipTopBottom`. With our giraffe example, we end up with less plausible samples that horizontal flipping, with the ground above the sky in some cases. 
-```python +```{.python .input} apply(example_image, transforms.RandomFlipTopBottom()) ``` @@ -140,7 +140,7 @@ image *= alpha So by setting this to 0.5 we randomly change the brightness of the image to a value between 50% ($1-0.5$) and 150% ($1+0.5$) of the original image. -```python +```{.python .input} apply(example_image, transforms.RandomBrightness(0.5)) ``` @@ -161,7 +161,7 @@ image += gray ``` -```python +```{.python .input} apply(example_image, transforms.RandomContrast(0.5)) ``` @@ -173,7 +173,7 @@ apply(example_image, transforms.RandomContrast(0.5)) Use `RandomSaturation` to add a random saturation jitter to an image. Saturation can be thought of as the 'amount' of color in an image. Use the `saturation` parameter to control the amount of jitter in saturation, with value from 0 (no change) to 1 (potentially large change). `saturation` doesn't specify whether the saturation of the augmented image will be higher or lower, just the potential strength of the effect. Specifically the augmentation is using the method detailed [here](https://beesbuzz.biz/code/16-hsv-color-transforms). -```python +```{.python .input} apply(example_image, transforms.RandomSaturation(0.5)) ``` @@ -185,7 +185,7 @@ apply(example_image, transforms.RandomSaturation(0.5)) Use `RandomHue` to add a random hue jitter to images. Hue can be thought of as the 'shade' of the colors in an image. Use the `hue` parameter to control the amount of jitter in hue, with value from 0 (no change) to 1 (potentially large change). `hue` doesn't specify whether the hue of the augmented image will be shifted one way or the other, just the potential strength of the effect. Specifically the augmentation is using the method detailed [here](https://beesbuzz.biz/code/16-hsv-color-transforms). -```python +```{.python .input} apply(example_image, transforms.RandomHue(0.5)) ``` @@ -197,7 +197,7 @@ apply(example_image, transforms.RandomHue(0.5)) `RandomColorJitter` is a convenience transform that can be used to perform multiple color augmentations at once. You can set the `brightness`, `contrast`, `saturation` and `hue` jitters, that function the same as above for their individual transforms. -```python +```{.python .input} color_aug = transforms.RandomColorJitter(brightness=0.5, contrast=0.5, saturation=0.5, @@ -213,7 +213,7 @@ apply(example_image, color_aug) Use `RandomLighting` for an AlexNet-style PCA-based noise augmentation. -```python +```{.python .input} apply(example_image, transforms.RandomLighting(alpha=1)) ``` @@ -224,7 +224,7 @@ apply(example_image, transforms.RandomLighting(alpha=1)) In practice, we apply multiple augmentation techniques to an image to increase the variety of images in the dataset. Using the `Compose` transform that was introduced in the [Data Transforms tutorial](), we can apply 3 of the transforms we previously used above. 
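
In a full training pipeline, a composed set of augmentations like the one built below is typically attached to a `Dataset` with `transform_first`, so it runs on every image as it is loaded. A minimal sketch (the dataset and parameter values here are illustrative, not part of this tutorial):

```{.python .input}
# Sketch: wiring composed augmentations into a training DataLoader via transform_first
from mxnet.gluon.data import DataLoader
from mxnet.gluon.data.vision import CIFAR10

train_augs = transforms.Compose([transforms.RandomFlipLeftRight(),
                                 transforms.RandomColorJitter(brightness=0.3),
                                 transforms.ToTensor()])
train_set = CIFAR10(train=True).transform_first(train_augs)
train_loader = DataLoader(train_set, batch_size=32, shuffle=True)
```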
-```python +```{.python .input} augs = transforms.Compose([ transforms.RandomFlipLeftRight(), color_aug, shape_aug]) apply(example_image, augs) @@ -232,4 +232,4 @@ apply(example_image, augs) ![png](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/doc/tutorials/gluon/transforms/output_41_0.png) - \ No newline at end of file + diff --git a/docs/python_docs/python/tutorials/packages/gluon/data/datasets.md b/docs/python_docs/python/tutorials/packages/gluon/data/datasets.md index bd291dd92253..c09e62a8cd2c 100644 --- a/docs/python_docs/python/tutorials/packages/gluon/data/datasets.md +++ b/docs/python_docs/python/tutorials/packages/gluon/data/datasets.md @@ -27,7 +27,7 @@ One of the most critical steps for model training and inference is loading the d We first start by generating random data `X` (with 3 variables) and corresponding random labels `y` to simulate a typical supervised learning task. We generate 10 samples and we pass them all to the `ArrayDataset`. -```python +```{.python .input} import mxnet as mx import os import tarfile @@ -41,7 +41,7 @@ dataset = mx.gluon.data.dataset.ArrayDataset(X, y) A key feature of a `Dataset` is the __*ability to retrieve a single sample given an index*__. Our random data and labels were generated in memory, so this `ArrayDataset` doesn't have to load anything from disk, but the interface is the same for all `Dataset`'s. -```python +```{.python .input} sample_idx = 4 sample = dataset[sample_idx] @@ -67,7 +67,7 @@ A [DataLoader](/api/python/docs/api/gluon/data/index.html#dataloader) is used to Another benefit of using `DataLoader` is the ability to easily load data in parallel using [multiprocessing](https://docs.python.org/3.6/library/multiprocessing.html). You can set the `num_workers` parameter to the number of CPUs available on your machine for maximum performance, or limit it to a lower number to spare resources. -```python +```{.python .input} from multiprocessing import cpu_count CPU_COUNT = cpu_count() @@ -95,7 +95,7 @@ Using Gluon `Dataset` objects, we define the data to be included in each of thes Many of the image `Dataset`'s accept a function (via the optional `transform` parameter) which is applied to each sample returned by the `Dataset`. It's useful for performing data augmentation, but can also be used for more simple data type conversion and pixel value scaling as seen below. -```python +```{.python .input} def transform(data, label): data = data.astype('float32')/255 return data, label @@ -105,7 +105,7 @@ valid_dataset = mx.gluon.data.vision.datasets.FashionMNIST(train=False, transfor ``` -```python +```{.python .input} %matplotlib inline from matplotlib.pylab import imshow @@ -136,7 +136,7 @@ When training machine learning models it is important to shuffle the training sa If you have more complex shuffling requirements (e.g. when handling sequential data), take a look at [mxnet.gluon.data.BatchSampler](/api/python/docs/api/gluon/data/index.html#mxnet.gluon.data.BatchSampler) and pass this to your `DataLoader` instead. -```python +```{.python .input} batch_size = 32 train_data_loader = mx.gluon.data.DataLoader(train_dataset, batch_size, shuffle=True, num_workers=CPU_COUNT) valid_data_loader = mx.gluon.data.DataLoader(valid_dataset, batch_size, num_workers=CPU_COUNT) @@ -145,7 +145,7 @@ valid_data_loader = mx.gluon.data.DataLoader(valid_dataset, batch_size, num_work With both `DataLoader`s defined, we can now train a model to classify each image and evaluate the validation loss at each epoch. 
Our Fashion MNIST dataset has 10 classes including shirt, dress, sneakers, etc. We define a simple fully connected network with a softmax output and use cross entropy as our loss. -```python +```{.python .input} from mxnet import gluon, autograd, ndarray def construct_net(): @@ -166,7 +166,7 @@ criterion = gluon.loss.SoftmaxCrossEntropyLoss() trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.1}) ``` -```python +```{.python .input} epochs = 5 @@ -224,7 +224,7 @@ We will run through an example for image classification, but a similar process a You can download the Caltech 101 dataset if you don't already have images to work with for this example, but please note the download is 126MB. -```python +```{.python .input} data_folder = "data" dataset_name = "101_ObjectCategories" @@ -243,7 +243,7 @@ if not os.path.isfile(archive_path): After downloading and extracting the data archive, we have two folders: `data/101_ObjectCategories` and `data/101_ObjectCategories_test`. We load the data into separate training and testing [ImageFolderDataset](/api/python/docs/api/gluon/data/vision/datasets/index.html#mxnet.gluon.data.vision.datasets.ImageFolderDataset)s. -```python +```{.python .input} training_path = os.path.join(data_folder, dataset_name) testing_path = os.path.join(data_folder, "{}_test".format(dataset_name)) ``` @@ -253,7 +253,7 @@ We instantiate the [ImageFolderDataset](https://mxnet.incubator.apache.org/api/p Optionally, you can pass a `transform` parameter to these `Dataset`'s as we've seen before. -```python +```{.python .input} train_dataset = mx.gluon.data.vision.datasets.ImageFolderDataset(training_path) test_dataset = mx.gluon.data.vision.datasets.ImageFolderDataset(testing_path) ``` @@ -263,7 +263,7 @@ Samples from these datasets are tuples of data and label. Images are loaded from As with the Fashion MNIST dataset the labels will be integer encoded. You can use the `synsets` property of the [ImageFolderDataset](https://mxnet.incubator.apache.org/api/python/gluon/data.html?highlight=imagefolderdataset#mxnet.gluon.data.vision.datasets.ImageFolderDataset)s to retrieve the original descriptions (e.g. `train_dataset.synsets[i]`). -```python +```{.python .input} sample_idx = 539 sample = train_dataset[sample_idx] data = sample[0] @@ -299,7 +299,7 @@ Before Gluon's [DataLoader](/api/python/docs/api/gluon/data/index.html#dataloade So you can get up and running with Gluon quicker if you have already implemented complex pre-processing steps using `DataIter`, we have provided a simple class to wrap existing `DataIter` objects so they can be used in a typical Gluon training loop. You can use this class for `DataIter`s such as [mxnet.image.ImageIter](/api/python/docs/api/mxnet/image/index.html#mxnet.image.ImageIter) and [mxnet.io.ImageRecordIter](/api/python/docs/api/mxnet/io/index.html#mxnet.io.ImageDetRecordIter) that have single data and label arrays. 
-```python +```{.python .input} class DataIterLoader(): def __init__(self, data_iter): self.data_iter = data_iter @@ -320,7 +320,7 @@ class DataIterLoader(): ``` -```python +```{.python .input} data_iter = mx.io.NDArrayIter(data=X, label=y, batch_size=5) data_iter_loader = DataIterLoader(data_iter) for X_batch, y_batch in data_iter_loader: diff --git a/docs/python_docs/python/tutorials/packages/gluon/image/image-augmentation.md b/docs/python_docs/python/tutorials/packages/gluon/image/image-augmentation.md deleted file mode 100644 index 70be781be3d6..000000000000 --- a/docs/python_docs/python/tutorials/packages/gluon/image/image-augmentation.md +++ /dev/null @@ -1,148 +0,0 @@ - - - - - - - - - - - - - - - - - -# Image Augmentation - -Image augmentation technology expands the scale of -training data sets by making a series of random changes to the training images -to produce similar, but different, training examples. Given its popularity in -computer vision, the `mxnet.gluon.data.vision.transforms` model provides -multiple pre-defined image augmentation methods. In this section we will briefly -go through this module. - -First, import the module required for this section. - -```python -from matplotlib import pyplot as plt -from mxnet import image -from mxnet.gluon import data as gdata, utils -``` - -Then read the sample $400\times 500$ image. - -```python -utils.download('https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/doc/cat.jpg') -img = image.imread('cat.jpg') -plt.imshow(img.asnumpy()) -plt.show() -``` - -In addition, we define a function to draw a list of images. - -```python -def show_images(imgs, num_rows, num_cols, scale=2): - figsize = (num_cols * scale, num_rows * scale) - _, axes = plt.subplots(num_rows, num_cols, figsize=figsize) - for i in range(num_rows): - for j in range(num_cols): - axes[i][j].imshow(imgs[i * num_cols + j].asnumpy()) - axes[i][j].axes.get_xaxis().set_visible(False) - axes[i][j].axes.get_yaxis().set_visible(False) - return axes -``` - -Most image augmentation methods have a certain degree of randomness. To make it -easier for us to observe the effect of image augmentation, we next define the -auxiliary function `apply`. This function runs the image augmentation method -`aug` multiple times on the input image `img` and shows all results. - -```python -def apply(img, aug, num_rows=2, num_cols=4, scale=3): - Y = [aug(img) for _ in range(num_rows * num_cols)] - show_images(Y, num_rows, num_cols, scale) -``` - -## Flip and Crop - -Flipping the image left and right usually does not change the -category of the object. This is one of the earliest and most widely used methods -of image augmentation. Next, we use the `transforms` module to create the -`RandomFlipLeftRight` instance, which introduces a 50% chance that the image is -flipped left and right. - -```python -apply(img, gdata.vision.transforms.RandomFlipLeftRight()) -``` - -Flipping up and down is not as commonly used as flipping left and right. -However, at least for this example image, flipping up and down does not hinder -recognition. Next, we create a `RandomFlipTopBottom` instance for a 50% chance -of flipping the image up and down. - -```python -apply(img, gdata.vision.transforms.RandomFlipTopBottom()) -``` - -In the example image we used, the cat is in the middle of the image, but this -may not be the case for all images. 
In the [Pooling Layer](https://d2l.ai/chapter_convolutional-neural-networks/pooling.html) section of the d2l.ai book, we explain that the pooling layer can reduce the sensitivity of the convolutional -layer to the target location. In addition, we can make objects appear at -different positions in the image in different proportions by randomly cropping -the image. This can also reduce the sensitivity of the model to the target -position. - -In the following code, we randomly crop a region with an area of 10% -to 100% of the original area, and the ratio of width to height of the region is -randomly selected from between 0.5 and 2. Then, the width and height of the -region are both scaled to 200 pixels. Unless otherwise stated, the random number -between $a$ and $b$ in this section refers to a continuous value obtained by -uniform sampling in the interval $[a,b]$. - -```{.python .input n=7} -shape_aug = gdata.vision.transforms.RandomResizedCrop( - (200, 200), scale=(0.1, 1), ratio=(0.5, 2)) -apply(img, shape_aug) -``` - -## Change Color - -Another augmentation method is changing colors. We can change -four aspects of the image color: brightness, contrast, saturation, and hue. In -the example below, we randomly change the brightness of the image to a value -between 50% ($1-0.5$) and 150% ($1+0.5$) of the original image. - -```{.python .input n=8} -apply(img, gdata.vision.transforms.RandomBrightness(0.5)) -``` - -Similarly, we can randomly change the hue of the image. - -```{.python .input n=9} -apply(img, gdata.vision.transforms.RandomHue(0.5)) -``` - -We can also create a `RandomColorJitter` instance and set how to randomly change -the `brightness`, `contrast`, `saturation`, and `hue` of the image at the same -time. - -```{.python .input n=10} -color_aug = gdata.vision.transforms.RandomColorJitter( - brightness=0.5, contrast=0.5, saturation=0.5, hue=0.5) -apply(img, color_aug) -``` - -## Overlying Multiple Image Augmentation Methods - -In practice, we will overlay -multiple image augmentation methods. We can overlay the different image -augmentation methods defined above and apply them to each image by using a -`Compose` instance. - -```{.python .input n=11} -augs = gdata.vision.transforms.Compose([ - gdata.vision.transforms.RandomFlipLeftRight(), color_aug, shape_aug]) -apply(img, augs) -``` diff --git a/docs/python_docs/python/tutorials/packages/gluon/image/info_gan.md b/docs/python_docs/python/tutorials/packages/gluon/image/info_gan.md index 463ee341e7c4..d7cd28aef5fe 100644 --- a/docs/python_docs/python/tutorials/packages/gluon/image/info_gan.md +++ b/docs/python_docs/python/tutorials/packages/gluon/image/info_gan.md @@ -22,7 +22,7 @@ This notebook shows how to implement an InfoGAN based on Gluon. InfoGAN is an ex The codes are made meaningful by maximizing the mutual information between code and generator output. InfoGAN learns a disentangled representation in a completely unsupervised manner. It can be used for many applications such as image similarity search. This notebook uses the DCGAN example from the [Straight Dope Book](https://gluon.mxnet.io/chapter14_generative-adversarial-networks/dcgan.html) and extends it to create an InfoGAN. -```python +```{.python .input} from __future__ import print_function from datetime import datetime import logging @@ -46,7 +46,7 @@ from mxnet import autograd The latent code vector can contain several variables, which can be categorical and/or continuous. We set `n_continuous` to 2 and `n_categories` to 10. 
-```python +```{.python .input} batch_size = 64 z_dim = 100 n_continuous = 2 @@ -57,7 +57,7 @@ ctx = mx.gpu() if mx.context.num_gpus() else mx.cpu() Some functions to load and normalize images. -```python +```{.python .input} lfw_url = 'http://vis-www.cs.umass.edu/lfw/lfw-deepfunneled.tgz' data_path = 'lfw_dataset' if not os.path.exists(data_path): @@ -69,7 +69,7 @@ if not os.path.exists(data_path): ``` -```python +```{.python .input} def transform(data, width=64, height=64): data = mx.image.imresize(data, width, height) data = nd.transpose(data, (2,0,1)) @@ -80,7 +80,7 @@ def transform(data, width=64, height=64): ``` -```python +```{.python .input} def get_files(data_dir): images = [] filenames = [] @@ -99,7 +99,7 @@ def get_files(data_dir): Load the dataset `lfw_dataset` which contains images of celebrities. -```python +```{.python .input} data_dir = 'lfw_dataset' images, filenames = get_files(data_dir) split = int(len(images)*0.8) @@ -116,7 +116,7 @@ train_dataloader = gluon.data.DataLoader(train_data, batch_size=batch_size, shuf Define the Generator model. Architecture is taken from the DCGAN implementation in [Straight Dope Book](https://gluon.mxnet.io/chapter14_generative-adversarial-networks/dcgan.html). The Generator consist of 4 layers where each layer involves a strided convolution, batch normalization, and rectified nonlinearity. It takes as input random noise and the latent code and produces an `(64,64,3)` output image. -```python +```{.python .input} class Generator(gluon.HybridBlock): def __init__(self, **kwargs): super(Generator, self).__init__(**kwargs) @@ -149,7 +149,7 @@ class Generator(gluon.HybridBlock): Define the Discriminator and Q model. The Q model shares many layers with the Discriminator. Its task is to estimate the code `c` for a given fake image. It is used to maximize the lower bound to the mutual information. -```python +```{.python .input} class Discriminator(gluon.HybridBlock): def __init__(self, **kwargs): super(Discriminator, self).__init__(**kwargs) @@ -195,7 +195,7 @@ Discriminator and Generator are the same as in the DCGAN example. On top of the Initialize Generator and Discriminator and define correspoing trainer function. -```python +```{.python .input} generator = Generator() generator.hybridize() generator.initialize(mx.init.Normal(0.002), ctx=ctx) @@ -215,7 +215,7 @@ q_trainer = gluon.Trainer(discriminator.Q.collect_params(), 'adam', {'learning_r Create vectors with real (=1) and fake labels (=0). -```python +```{.python .input} real_label = nd.ones((batch_size,), ctx=ctx) fake_label = nd.zeros((batch_size,),ctx=ctx) ``` @@ -223,7 +223,7 @@ fake_label = nd.zeros((batch_size,),ctx=ctx) Load a pretrained model. -```python +```{.python .input} if os.path.isfile('infogan_d_latest.params') and os.path.isfile('infogan_g_latest.params'): discriminator.load_parameters('infogan_d_latest.params', ctx=ctx, allow_missing=True, ignore_extra=True) generator.load_parameters('infogan_g_latest.params', ctx=ctx, allow_missing=True, ignore_extra=True) @@ -243,7 +243,7 @@ where `V(D,G)` is the GAN loss and the mutual information `I(c, G(z, c))` goes i Define the loss functions. `SoftmaxCrossEntropyLoss` for the categorical code, `L2Loss` for the continious code and `SigmoidBinaryCrossEntropyLoss` for the normal GAN loss. 
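
Before instantiating them below, here is a quick sketch of what each of these losses expects, run on dummy inputs (all shapes and values are purely illustrative and not part of the training loop):

```{.python .input}
# Illustrative dummy-shape check of the three loss terms
bce = gluon.loss.SigmoidBinaryCrossEntropyLoss()
l2 = gluon.loss.L2Loss()
ce = gluon.loss.SoftmaxCrossEntropyLoss()

d_logits = nd.random.uniform(shape=(4, 1))               # discriminator output (logits)
print(bce(d_logits, nd.ones((4, 1))))                    # real/fake term

cont_pred = nd.random.normal(shape=(4, 2))               # predicted continuous code
cont_code = nd.random.normal(shape=(4, 2))
print(l2(cont_pred, cont_code))                          # continuous-code term

cat_logits = nd.random.normal(shape=(4, 10))             # predicted category logits
print(ce(cat_logits, nd.array([0, 3, 5, 9])))            # categorical-code term
```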
-```python +```{.python .input} loss1 = gluon.loss.SigmoidBinaryCrossEntropyLoss() loss2 = gluon.loss.L2Loss() loss3 = gluon.loss.SoftmaxCrossEntropyLoss() @@ -252,7 +252,7 @@ loss3 = gluon.loss.SoftmaxCrossEntropyLoss() This function samples `c`, `z`, and concatenates them to create the generator input. -```python +```{.python .input} def create_generator_input(): #create random noise @@ -273,7 +273,7 @@ Define the training loop. 4. Update Generator and Q -```python +```{.python .input} with SummaryWriter(logdir='./logs/') as sw: epochs = 1 @@ -351,7 +351,7 @@ Once the InfoGAN is trained, we can use the Discriminator to do an image similar Load the trained discriminator and retrieve one of its last layers. -```python +```{.python .input} discriminator = Discriminator() discriminator.load_parameters("infogan_d_latest.params", ctx=ctx, ignore_extra=True) @@ -364,7 +364,7 @@ discriminator.hybridize() Nearest neighbor function, which takes a matrix of features and an input feature vector. It returns the 3 closest features. -```python +```{.python .input} def get_knn(features, input_vector, k=3): dist = (nd.square(features - input_vector).sum(axis=1))/features.shape[0] indices = dist.asnumpy().argsort()[:k] @@ -374,7 +374,7 @@ def get_knn(features, input_vector, k=3): A helper function to visualize image data. -```python +```{.python .input} def visualize(img_array): plt.imshow(((img_array.asnumpy().transpose(1, 2, 0) + 1.0) * 127.5).astype(np.uint8)) plt.axis('off') @@ -383,7 +383,7 @@ def visualize(img_array): Take some images from the test data, obtain its feature vector from `discriminator.D[:11]` and plot images of the corresponding closest vectors in the feature space. -```python +```{.python .input} feature_size = 8192 features = nd.zeros((len(test_images), feature_size), ctx=ctx) @@ -425,7 +425,7 @@ We trained the Generator for a couple of epochs and stored a couple of fake imag The following function computes the TSNE on the feature matrix and stores the result in a json-file. This file can be loaded with [TSNEViewer](https://ml4a.github.io/guides/ImageTSNEViewer/) -```python +```{.python .input} import json from sklearn.manifold import TSNE @@ -449,4 +449,4 @@ Load the file with TSNEViewer. You can now inspect whether similiar looking imag - \ No newline at end of file + diff --git a/docs/python_docs/python/tutorials/packages/gluon/image/mnist.md b/docs/python_docs/python/tutorials/packages/gluon/image/mnist.md index f18ec1a2357f..bed7f0848052 100644 --- a/docs/python_docs/python/tutorials/packages/gluon/image/mnist.md +++ b/docs/python_docs/python/tutorials/packages/gluon/image/mnist.md @@ -47,7 +47,7 @@ Before we define the model, let's first fetch the [MNIST](http://yann.lecun.com/ The following source code downloads and loads the images and the corresponding labels into memory. -```python +```{.python .input} import mxnet as mx # Fixing the random seed @@ -63,7 +63,7 @@ Data iterators take care of this by randomly shuffling the inputs. Note that we The following source code initializes the data iterators for the MNIST dataset. Note that we initialize two iterators: one for train data and one for test data. 
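As a quick, self-contained illustration of how `mx.io.NDArrayIter` behaves (toy random data here, not MNIST): each batch it yields carries `data` and `label` lists, and the iterator needs a `reset()` between epochs.

```{.python .input}
import mxnet as mx

toy_data = mx.nd.random.uniform(shape=(10, 1, 28, 28))
toy_label = mx.nd.zeros((10,))
toy_iter = mx.io.NDArrayIter(toy_data, toy_label, batch_size=5, shuffle=True)

for batch in toy_iter:
    print(batch.data[0].shape, batch.label[0].shape)
toy_iter.reset()  # rewind before reusing the iterator for another epoch
```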
-```python +```{.python .input} batch_size = 100 train_data = mx.io.NDArrayIter(mnist['train_data'], mnist['train_label'], batch_size, shuffle=True) val_data = mx.io.NDArrayIter(mnist['test_data'], mnist['test_label'], batch_size) @@ -75,7 +75,7 @@ We will cover a couple of approaches for performing the hand written digit recog Now, let's import required nn modules -```python +```{.python .input} from __future__ import print_function import mxnet as mx from mxnet import gluon @@ -96,7 +96,7 @@ The last fully connected layer often has its hidden size equal to the number of To do this, we will use [Sequential layer](https://mxnet.io/api/python/docs/api/gluon/_autogen/mxnet.gluon.nn.Sequential.html) type. This is simply a linear stack of neural network layers. `nn.Dense` layers are nothing but the fully connected layers we discussed above. -```python +```{.python .input} # define network net = nn.Sequential() net.add(nn.Dense(128, activation='relu')) @@ -115,7 +115,7 @@ We will use [Trainer](/api/python/docs/api/gluon/trainer.html) class to apply th [SGD optimizer](/api/python/docs/api/optimizer/index.html#mxnet.optimizer.SGD) on the initialized parameters. -```python +```{.python .input} gpus = mx.test_utils.list_gpus() ctx = [mx.gpu()] if gpus else [mx.cpu(0), mx.cpu(1)] net.initialize(mx.init.Xavier(magnitude=2.24), ctx=ctx) @@ -140,7 +140,7 @@ There are many predefined loss functions in gluon.loss. Here we use [softmax_cross_entropy_loss](https://mxnet.io/api/python/gluon/gluon.html#mxnet.gluon.loss.softmax_cross_entropy_loss) for digit classification. We will compute loss and do backward propagation inside training scope which is defined by `autograd.record()`. -```python +```{.python .input} %%time epoch = 10 # Use Accuracy as the evaluation metric. @@ -183,7 +183,7 @@ for i in range(epoch): After the above training completes, we can evaluate the trained model by running predictions on validation dataset. Since the dataset also has labels for all test images, we can compute the accuracy metric over validation data as follows: -```python +```{.python .input} # Use Accuracy as the evaluation metric. metric = mx.metric.Accuracy() # Reset the validation data iterator. @@ -218,7 +218,7 @@ The following source code defines a convolutional neural network architecture ca A typical way to write your network is creating a new class inherited from `gluon.Block` class. We can define the network by composing and inheriting Block class as follows: -```python +```{.python .input} import mxnet.ndarray as F class Net(gluon.Block): @@ -248,7 +248,7 @@ We also imported `mxnet.ndarray` package to use activation functions from `ndarr Now, We will create the network as follows: -```python +```{.python .input} net = Net() ``` @@ -264,7 +264,7 @@ Training and prediction can be done in the similar way as we did for MLP. We will initialize the network parameters as follows: -```python +```{.python .input} # set the context on GPU is available otherwise CPU ctx = [mx.gpu() if mx.test_utils.list_gpus() else mx.cpu()] net.initialize(mx.init.Xavier(magnitude=2.24), ctx=ctx) @@ -273,7 +273,7 @@ trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.03}) #### Training -```python +```{.python .input} # Use Accuracy as the evaluation metric. metric = mx.metric.Accuracy() softmax_cross_entropy_loss = gluon.loss.SoftmaxCrossEntropyLoss() @@ -315,7 +315,7 @@ for i in range(epoch): Finally, we'll use the trained LeNet model to generate predictions for the test data. 
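A compact sketch of what that prediction step boils down to, assuming the trained `net`, the `ctx` list and the `val_data` iterator from the cells above (the full evaluation loop is in the next cell): run a batch through the network and take the argmax over the class scores.

```{.python .input}
# peek at one validation batch and pick the most likely digit for each image
val_data.reset()
batch = next(iter(val_data))
output = net(batch.data[0].as_in_context(ctx[0]))
predicted_digits = output.argmax(axis=1)
print(predicted_digits[:10].asnumpy())
```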
-```python +```{.python .input} # Use Accuracy as the evaluation metric. metric = mx.metric.Accuracy() # Reset the validation data iterator. diff --git a/docs/python_docs/python/tutorials/packages/gluon/index.rst b/docs/python_docs/python/tutorials/packages/gluon/index.rst index 7cb083cd96d1..2b771c0dfec9 100644 --- a/docs/python_docs/python/tutorials/packages/gluon/index.rst +++ b/docs/python_docs/python/tutorials/packages/gluon/index.rst @@ -59,12 +59,6 @@ Getting started Saving and loading trained models. - .. card:: - :title: Using pre-trained models in MXNet - :link: image/pretrained_models.html - - Using pre-trained models with Apache MXNet. - Data ---- @@ -74,12 +68,6 @@ Data :title: Data Augmentation :link: data/data_augmentation.html - A guide to data augmentation. - - .. card:: - :title: Image Augmentation - :link: image/image-augmentation.html - Boost your training dataset with image augmentation. .. card:: diff --git a/docs/python_docs/python/tutorials/packages/gluon/loss/custom-loss.md b/docs/python_docs/python/tutorials/packages/gluon/loss/custom-loss.md index 1cae0c91b837..6aa556f92d3a 100644 --- a/docs/python_docs/python/tutorials/packages/gluon/loss/custom-loss.md +++ b/docs/python_docs/python/tutorials/packages/gluon/loss/custom-loss.md @@ -25,7 +25,7 @@ All neural networks need a loss function for training. A loss function is a quan However, we may sometimes want to solve problems that require customized loss functions; this tutorial shows how we can do that in Gluon. We will implement contrastive loss which is typically used in Siamese networks. -```python +```{.python .input} import matplotlib.pyplot as plt import mxnet as mx from mxnet import autograd, gluon, nd @@ -48,7 +48,7 @@ The loss function uses a margin *m* which is has the effect that dissimlar pairs In order to implement such a customized loss function in Gluon, we only need to define a new class that is inheriting from the [Loss](/api/python/docs/api/gluon/loss/index.html#mxnet.gluon.loss.Loss) base class. We then define the contrastive loss logic in the [hybrid_forward](/api/python/docs/api/gluon/hybrid_block.html#mxnet.gluon.HybridBlock.hybrid_forward) method. This method takes the images `image1`, `image2` and the label which defines whether `image1` and `image2` are similar (=0) or dissimilar (=1). The input F is an `mxnet.ndarry` or an `mxnet.symbol` if we hybridize the network. Gluon's `Loss` base class is in fact a [HybridBlock](/api/python/docs/api/gluon/hybrid_block.html). This means we can either run imperatively or symbolically. When we hybridize our custom loss function, we can get performance speedups. -```python +```{.python .input} class ContrastiveLoss(Loss): def __init__(self, margin=6., weight=None, batch_axis=0, **kwargs): super(ContrastiveLoss, self).__init__(weight, batch_axis, **kwargs) @@ -71,7 +71,7 @@ A [Siamese network](https://papers.nips.cc/paper/769-signature-verification-usin Our network consists of 2 convolutional and max pooling layers that downsample the input image. The output is then fed through a fully connected layer with 256 hidden units and another fully connected layer with 2 hidden units. -```python +```{.python .input} class Siamese(gluon.HybridBlock): def __init__(self, **kwargs): super(Siamese, self).__init__(**kwargs) @@ -95,7 +95,7 @@ class Siamese(gluon.HybridBlock): We train our network on the [Ominglot](http://www.omniglot.com/) dataset which is a collection of 1623 hand drawn characters from 50 alphabets. 
You can download it from [here](https://github.com/brendenlake/omniglot/tree/master/python). We need to create a dataset that contains a random set of similar and dissimilar images. We use Gluon's `ImageFolderDataset` where we overwrite `__getitem__` and randomly return similar and dissimilar pairs of images. -```python +```{.python .input} class GetImagePairs(mx.gluon.data.vision.ImageFolderDataset): def __init__(self, root): super(GetImagePairs, self).__init__(root, flag=0) @@ -125,7 +125,7 @@ class GetImagePairs(mx.gluon.data.vision.ImageFolderDataset): We train the network on a subset of the data, the [*Tifinagh*](https://www.omniglot.com/writing/tifinagh.htm) alphabet. Once the model is trained we test it on the [*Inuktitut*](https://www.omniglot.com/writing/inuktitut.htm) alphabet. -```python +```{.python .input} def transform(img0, img1, label): normalized_img0 = nd.transpose(img0.astype('float32'), (2, 0, 1))/255.0 normalized_img1 = nd.transpose(img1.astype('float32'), (2, 0, 1))/255.0 @@ -144,7 +144,7 @@ test_dataloader = gluon.data.DataLoader(test.transform(transform), Following code plots some examples from the test dataset. -```python +```{.python .input} img1, img2, label = test[0] print("Same: {}".format(int(label.asscalar()) == 0)) fig, (ax0, ax1) = plt.subplots(ncols=2, figsize=(10, 5)) @@ -164,7 +164,7 @@ plt.show() Before we can start training, we need to instantiate the custom constrastive loss function and initialize the model. -```python +```{.python .input} model = Siamese() model.initialize(init=mx.init.Xavier()) trainer = gluon.Trainer(model.collect_params(), 'adam', {'learning_rate': 0.001}) @@ -174,7 +174,7 @@ loss = ContrastiveLoss(margin=6.0) Start the training loop: -```python +```{.python .input} for epoch in range(10): for i, data in enumerate(train_dataloader): image1, image2, label = data @@ -192,7 +192,7 @@ for epoch in range(10): During inference we compute the Euclidean distance between the output vectors of the Siamese network. High distance indicates dissimilarity, low values indicate similarity. -```python +```{.python .input} for i, data in enumerate(test_dataloader): img1, img2, label = data output1, output2 = model(img1, img2) diff --git a/docs/python_docs/python/tutorials/packages/gluon/loss/loss.md b/docs/python_docs/python/tutorials/packages/gluon/loss/loss.md index 018e75f5fcab..e7efbcadc467 100644 --- a/docs/python_docs/python/tutorials/packages/gluon/loss/loss.md +++ b/docs/python_docs/python/tutorials/packages/gluon/loss/loss.md @@ -209,7 +209,7 @@ The loss is large, if the predicted probability distribution is far from the gro For instance, in the following example we get a KL divergence of 0.02. We set ```from_logits=False```, so the loss functions will apply ```log_softmax``` on the network output, before computing the KL divergence. 
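That figure can be roughly cross-checked by hand (a sketch; the built-in loss appears to also average over the class axis, so its reported value is smaller than the raw KL sum computed here): apply `log_softmax` to the output and evaluate the KL divergence directly.

```{.python .input}
import mxnet as mx

output = mx.nd.array([[0.39056206, 1.3068528, 0.39056206, -0.30258512]])
target_dist = mx.nd.array([[0.3, 0.4, 0.1, 0.2]])
log_q = mx.nd.log_softmax(output)
# raw per-sample KL sum (about 0.10); dividing by the 4 classes gives roughly
# the smaller value quoted above
kl = (target_dist * (target_dist.log() - log_q)).sum(axis=1)
print(kl.asnumpy())
```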
-```python +```{.python .input} output = mx.nd.array([[0.39056206, 1.3068528, 0.39056206, -0.30258512]]) print('output.softmax(): {}'.format(output.softmax().asnumpy().tolist())) target_dist = mx.nd.array([[0.3, 0.4, 0.1, 0.2]]) diff --git a/docs/python_docs/python/tutorials/packages/gluon/training/fit_api_tutorial.md b/docs/python_docs/python/tutorials/packages/gluon/training/fit_api_tutorial.md index 3858d0f5e17f..ce270da32d3d 100644 --- a/docs/python_docs/python/tutorials/packages/gluon/training/fit_api_tutorial.md +++ b/docs/python_docs/python/tutorials/packages/gluon/training/fit_api_tutorial.md @@ -31,7 +31,7 @@ To complete this tutorial, you will need: - [Jupyter Notebook](https://jupyter.org/index.html) (For interactively running the provided .ipynb file) -```python +```{.python .input} import mxnet as mx from mxnet import gluon from mxnet.gluon.model_zoo import vision @@ -52,7 +52,7 @@ ctx = [mx.gpu(i) for i in range(gpu_count)] if gpu_count > 0 else mx.cpu() We will use the ```gluon.data.vision``` package to directly import the Fashion-MNIST dataset and perform pre-processing on it. -```python +```{.python .input} # Get the training data fashion_mnist_train = gluon.data.vision.FashionMNIST(train=True) @@ -61,7 +61,7 @@ fashion_mnist_val = gluon.data.vision.FashionMNIST(train=False) ``` -```python +```{.python .input} transforms = [gluon.data.vision.transforms.Resize(224), # We pick 224 as the model we use takes an input of size 224. gluon.data.vision.transforms.ToTensor()] @@ -70,14 +70,14 @@ transforms = gluon.data.vision.transforms.Compose(transforms) ``` -```python +```{.python .input} # Apply the transformations fashion_mnist_train = fashion_mnist_train.transform_first(transforms) fashion_mnist_val = fashion_mnist_val.transform_first(transforms) ``` -```python +```{.python .input} batch_size = 256 # Batch size of the images num_workers = 4 # The number of parallel workers for loading the data using Data Loaders. @@ -92,7 +92,7 @@ val_data_loader = gluon.data.DataLoader(fashion_mnist_val, batch_size=batch_size Let's load the resnet-18 model architecture from [Gluon Model Zoo](https://mxnet.apache.org/api/python/gluon/model_zoo.html) and initialize its parameters. The Gluon Model Zoo contains a repository of pre-trained models as well the model architecture definitions. We are using the model architecture from the model zoo in order to train it from scratch. -```python +```{.python .input} resnet_18_v1 = vision.resnet18_v1(pretrained=False, classes = 10) resnet_18_v1.initialize(init = mx.init.Xavier(), ctx=ctx) ``` @@ -101,14 +101,14 @@ We will be using `SoftmaxCrossEntropyLoss` as the loss function since this is a You can experiment with a [different loss](/api/python/docs/api/gluon/loss/index.html) or [optimizer](/api/python/docs/api/optimizer/index.html) as well. -```python +```{.python .input} loss_fn = gluon.loss.SoftmaxCrossEntropyLoss() ``` Let's define the trainer object for training the model. 
-```python +```{.python .input} learning_rate = 0.04 # You can experiment with your own learning rate here num_epochs = 2 # You can run training for more epochs trainer = gluon.Trainer(resnet_18_v1.collect_params(), @@ -124,7 +124,7 @@ In the basic usage example, with just 2 lines of code, we will set up our model ### Basic Usage -```python +```{.python .input} train_acc = mx.metric.Accuracy() # Metric to monitor # Define the estimator, by passing to it the model, loss function, metrics, trainer object and context @@ -174,7 +174,7 @@ Our custom event handler is a simple one: record the loss values at the end of e Note: For each of the method, the `Estimator` object is passed along, so you can access training metrics. -```python +```{.python .input} class LossRecordHandler(TrainBegin, TrainEnd, EpochEnd): def __init__(self): super(LossRecordHandler, self).__init__() @@ -201,7 +201,7 @@ class LossRecordHandler(TrainBegin, TrainEnd, EpochEnd): ``` -```python +```{.python .input} # Let's reset the model, trainer and accuracy objects from above resnet_18_v1.initialize(force_reinit=True, init = mx.init.Xavier(), ctx=ctx) @@ -211,7 +211,7 @@ train_acc = mx.metric.Accuracy() ``` -```python +```{.python .input} # Define the estimator, by passing to it the model, loss function, metrics, trainer object and context est = estimator.Estimator(net=resnet_18_v1, loss=loss_fn, @@ -255,7 +255,7 @@ with warnings.catch_warnings(): You can load the saved model, by using the `load_parameters` API in Gluon. For more details refer to the [Loading model parameters from file tutorial](/api/python/docs/tutorials/packages/gluon/blocks/save_load_params.html#saving-model-parameters-to-file) -```python +```{.python .input} resnet_18_v1 = vision.resnet18_v1(pretrained=False, classes=10) resnet_18_v1.load_parameters('./my_model-best.params', ctx=ctx) ``` diff --git a/docs/python_docs/python/tutorials/packages/gluon/training/learning_rates/learning_rate_finder.md b/docs/python_docs/python/tutorials/packages/gluon/training/learning_rates/learning_rate_finder.md index a32c8a1e92cd..0cab619a72bb 100644 --- a/docs/python_docs/python/tutorials/packages/gluon/training/learning_rates/learning_rate_finder.md +++ b/docs/python_docs/python/tutorials/packages/gluon/training/learning_rates/learning_rate_finder.md @@ -41,7 +41,7 @@ As expected, for very small learning rates we don't see much change in the loss Usually, our unit of work is an epoch (a full pass through the dataset) and the learning rate would typically be held constant throughout the epoch. With the Learning Rate Finder (and cyclical learning rate schedules) we are required to vary the learning rate every iteration. As such we structure our training code so that a single iteration can be run with a given learning rate. You can implement Learner as you wish. Just initialize the network, define the loss and trainer in `__init__` and keep your training logic for a single batch in `iteration`. -```python +```{.python .input} import mxnet as mx # Set seed for reproducibility @@ -96,7 +96,7 @@ class Learner(): We also adjust our `DataLoader` so that it continuously provides batches of data and doesn't stop after a single epoch. We can then call `iteration` as many times as required for the loss to diverge as part of the Learning Rate Finder process. We implement a custom `BatchSampler` for this, that keeps returning random indices of samples to be included in the next batch. We use the CIFAR-10 dataset for image classification to test our Learning Rate Finder. 
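The idea behind such a sampler can be sketched in a few lines (the `InfiniteBatchSampler` name here is hypothetical and purely illustrative; the tutorial's own implementation is in the next cell):

```{.python .input}
import random

class InfiniteBatchSampler:
    """Yields random batches of indices forever, so iteration never stops at an epoch boundary."""
    def __init__(self, num_samples, batch_size):
        self.num_samples = num_samples
        self.batch_size = batch_size

    def __iter__(self):
        while True:
            yield [random.randrange(self.num_samples) for _ in range(self.batch_size)]

sampler = InfiniteBatchSampler(num_samples=50000, batch_size=4)
print(next(iter(sampler)))
```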
-```python +```{.python .input} from mxnet.gluon.data.vision import transforms transform = transforms.Compose([ @@ -133,7 +133,7 @@ data_loader = mx.gluon.data.DataLoader(dataset, batch_sampler=batch_sampler) With preparation complete, we're ready to write our Learning Rate Finder that wraps the `Learner` we defined above. We implement a `find` method for the procedure, and `plot` for the visualization. Starting with a very low learning rate as defined by `lr_start` we train one iteration at a time and keep multiplying the learning rate by `lr_multiplier`. We analyse the loss and continue until it diverges according to `LRFinderStoppingCriteria` (which is defined later on). You may also notice that we save the parameters and state of the optimizer before the process and restore afterwards. This is so the Learning Rate Finder process doesn't impact the state of the model, and can be used at any point during training. -```python +```{.python .input} from matplotlib import pyplot as plt class LRFinder(): @@ -197,7 +197,7 @@ class LRFinder(): You can define the `LRFinderStoppingCriteria` as you wish, but empirical testing suggests using a smoothed average gives a more consistent stopping rule (see `smoothing`). We stop when the smoothed average of the loss exceeds twice the initial loss, assuming there have been a minimum number of iterations (see `min_iter`). -```python +```{.python .input} class LRFinderStoppingCriteria(): def __init__(self, smoothing=0.3, min_iter=20): """ @@ -230,7 +230,7 @@ class LRFinderStoppingCriteria(): Using a Pre-activation ResNet-18 from the Gluon model zoo, we instantiate our Learner and fire up our Learning Rate Finder! -```python +```{.python .input} ctx = mx.gpu() if mx.context.num_gpus() else mx.cpu() net = mx.gluon.model_zoo.vision.resnet18_v2(classes=10) learner = Learner(net=net, data_loader=data_loader, ctx=ctx) @@ -246,7 +246,7 @@ lr_finder.plot() As discussed before, we should select a learning rate where the loss is falling (i.e. from 0.001 to 0.05) but before the loss starts to diverge (i.e. 0.1). We prefer higher learning rates where possible, so we select an initial learning rate of 0.05. Just as a test, we will run 500 epochs using this learning rate and evaluate the loss on the final batch. As we're working with a single batch of 128 samples, the variance of the loss estimates will be reasonably high, but it will give us a general idea. We save the initialized parameters for a later comparison with other learning rates. -```python +```{.python .input} learner.net.save_parameters("net.params") lr = 0.05 @@ -272,7 +272,7 @@ We see a sizable drop in the loss from approx. 2.7 to 1.2. And now we have a baseline, let's see what happens when we train with a learning rate that's higher than advisable at 0.5. -```python +```{.python .input} net = mx.gluon.model_zoo.vision.resnet18_v2(classes=10) learner = Learner(net=net, data_loader=data_loader, ctx=ctx) learner.net.load_parameters("net.params", ctx=ctx) @@ -300,7 +300,7 @@ We still observe a fall in the loss but aren't able to reach as low as before. And lastly, we see how the model trains with a more conservative learning rate of 0.005. 
-```python +```{.python .input} net = mx.gluon.model_zoo.vision.resnet18_v2(classes=10) learner = Learner(net=net, data_loader=data_loader, ctx=ctx) learner.net.load_parameters("net.params", ctx=ctx) diff --git a/docs/python_docs/python/tutorials/packages/gluon/training/learning_rates/learning_rate_schedules.md b/docs/python_docs/python/tutorials/packages/gluon/training/learning_rates/learning_rate_schedules.md index 86d0f8fdd2c8..e26218a05836 100644 --- a/docs/python_docs/python/tutorials/packages/gluon/training/learning_rates/learning_rate_schedules.md +++ b/docs/python_docs/python/tutorials/packages/gluon/training/learning_rates/learning_rate_schedules.md @@ -28,7 +28,7 @@ Schedules define how the learning rate changes over time and are typically speci In this tutorial, we visualize the schedules defined in `mx.lr_scheduler`, show how to implement custom schedules and see an example of using a schedule while training models. Since schedules are passed to `mx.optimizer.Optimizer` classes, these methods work with both Module and Gluon APIs. -```python +```{.python .input} from __future__ import print_function import math import matplotlib.pyplot as plt @@ -39,7 +39,7 @@ import numpy as np %matplotlib inline ``` -```python +```{.python .input} def plot_schedule(schedule_fn, iterations=1500): # Iteration count starting at 1 iterations = [i+1 for i in range(iterations)] @@ -59,7 +59,7 @@ In this section, we take a look at the schedules in `mx.lr_scheduler`. All of th One of the most commonly used learning rate schedules is called stepwise decay, where the learning rate is reduced by a factor at certain intervals. MXNet implements a `FactorScheduler` for equally spaced intervals, and `MultiFactorScheduler` for greater control. We start with an example of halving the learning rate every 250 iterations. More precisely, the learning rate will be multiplied by `factor` _after_ the `step` index and multiples thereafter. So in the example below the learning rate of the 250th iteration will be 1 and the 251st iteration will be 0.5. -```python +```{.python .input} schedule = mx.lr_scheduler.FactorScheduler(step=250, factor=0.5) schedule.base_lr = 1 plot_schedule(schedule) @@ -74,7 +74,7 @@ Note: the `base_lr` is used to determine the initial learning rate. It takes a d We can define non-uniform intervals with `MultiFactorScheduler` and in the example below we halve the learning rate _after_ the 250th, 750th (i.e. a step length of 500 iterations) and 900th (a step length of 150 iterations). As before, the learning rate of the 250th iteration will be 1 and the 251th iteration will be 0.5. -```python +```{.python .input} schedule = mx.lr_scheduler.MultiFactorScheduler(step=[250, 750, 900], factor=0.5) schedule.base_lr = 1 plot_schedule(schedule) @@ -89,7 +89,7 @@ plot_schedule(schedule) Stepwise schedules and the discontinuities they introduce may sometimes lead to instability in the optimization, so in some cases smoother schedules are preferred. `PolyScheduler` gives a smooth decay using a polynomial function and reaches a learning rate of 0 after `max_update` iterations. In the example below, we have a quadratic function (`pwr=2`) that falls from 0.998 at iteration 1 to 0 at iteration 1000. After this the learning rate stays at 0, so nothing will be learnt from `max_update` iterations onwards. 
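As a quick sanity check on that 0.998 figure, assuming the usual polynomial decay form `base_lr * (1 - t / max_update) ** pwr`:

```{.python .input}
base_lr, max_update, pwr = 1, 1000, 2
for t in [1, 500, 999, 1000]:
    print(t, base_lr * (1 - t / max_update) ** pwr)
```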
-```python +```{.python .input} schedule = mx.lr_scheduler.PolyScheduler(max_update=1000, base_lr=1, pwr=2) plot_schedule(schedule) ``` @@ -107,7 +107,7 @@ And we don't evaluate at `iteration=0` (to get `base_lr`) since we are working w You can implement your own custom schedule with a function or callable class, that takes an integer denoting the iteration index (starting at 1) and returns a float representing the learning rate to be used for that iteration. We implement the Cosine Annealing Schedule in the example below as a callable class (see `__call__` method). -```python +```{.python .input} class CosineAnnealingSchedule(): def __init__(self, min_lr, max_lr, cycle_length): self.min_lr = min_lr @@ -138,7 +138,7 @@ While training a simple handwritten digit classifier on the MNIST dataset, we ta As discussed above, the schedule should return a learning rate given an (1-based) iteration index. -```python +```{.python .input} # Use GPU if one exists, else use CPU ctx = mx.gpu() if mx.context.num_gpus() else mx.cpu() @@ -176,7 +176,7 @@ net = build_cnn() We then initialize our network (technically deferred until we pass the first batch) and define the loss. -```python +```{.python .input} # Initialize the parameters with Xavier initializer net.initialize(mx.init.Xavier(), ctx=ctx) # Use cross entropy loss @@ -186,7 +186,7 @@ softmax_cross_entropy = mx.gluon.loss.SoftmaxCrossEntropyLoss() We're now ready to create our schedule, and in this example we opt for a stepwise decay schedule using `MultiFactorScheduler`. Since we're only training a demonstration model for a limited number of epochs (10 in total) we will exaggerate the schedule and drop the learning rate by 90% after the 4th, 7th and 9th epochs. We call these steps, and the drop occurs _after_ the step index. Schedules are defined for iterations (i.e. training batches), so we must represent our steps in iterations too. -```python +```{.python .input} steps_epochs = [4, 7, 9] # assuming we keep partial batches, see `last_batch` parameter of DataLoader iterations_per_epoch = math.ceil(len(train_dataset) / batch_size) @@ -201,26 +201,26 @@ Learning rate drops after iterations: [3752, 6566, 8442] ``` -```python +```{.python .input} schedule = mx.lr_scheduler.MultiFactorScheduler(step=steps_iterations, factor=0.1) ``` **We create our `Optimizer` and pass the schedule via the `lr_scheduler` parameter.** In this example we're using Stochastic Gradient Descent. -```python +```{.python .input} sgd_optimizer = mx.optimizer.SGD(learning_rate=0.03, lr_scheduler=schedule) ``` And we use this optimizer (with schedule) in our `Trainer` and train for 10 epochs. Alternatively, we could have set the `optimizer` to the string `sgd`, and pass a dictionary of the optimizer parameters directly to the trainer using `optimizer_params`. -```python +```{.python .input} trainer = mx.gluon.Trainer(params=net.collect_params(), optimizer=sgd_optimizer) ``` -```python +```{.python .input} num_epochs = 10 # epoch and batch counts starting at 1 for epoch in range(1, num_epochs+1): @@ -277,7 +277,7 @@ When using the method above you don't need to manually keep track of iteration c We replicate the example above, but now keep track of the `iteration_idx`, call the schedule and set the learning rate appropriately using `set_learning_rate`. We also use `schedule.base_lr` to set the initial learning rate for the schedule since we are calling the schedule directly and not using it as part of the `Optimizer`. 
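The step values printed below are consistent with the arithmetic you would expect here; this sketch assumes the 60,000-image training set and a batch size of 64 from the earlier data-loading cells (both are assumptions of the check, not new settings):

```{.python .input}
import math

num_train_samples, assumed_batch_size = 60000, 64   # assumed from the data-loading cells
iterations_per_epoch = math.ceil(num_train_samples / assumed_batch_size)
print([epoch * iterations_per_epoch for epoch in [4, 7, 9]])   # [3752, 6566, 8442]
```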
-```python +```{.python .input} net = build_cnn() net.initialize(mx.init.Xavier(), ctx=ctx) @@ -343,4 +343,4 @@ Once again, we see the learning rate start at 0.03, and fall to 0.00003 by the e We have a related tutorial on Advanced Learning Rate Schedules that shows reference implementations of schedules that give state-of-the-art results. We look at cyclical schedules applied to a variety of cycle shapes, and many other techniques such as warm-up and cool-down. - \ No newline at end of file + diff --git a/docs/python_docs/python/tutorials/packages/gluon/training/learning_rates/learning_rate_schedules_advanced.md b/docs/python_docs/python/tutorials/packages/gluon/training/learning_rates/learning_rate_schedules_advanced.md index c59c9515f02e..e6c40cd555da 100644 --- a/docs/python_docs/python/tutorials/packages/gluon/training/learning_rates/learning_rate_schedules_advanced.md +++ b/docs/python_docs/python/tutorials/packages/gluon/training/learning_rates/learning_rate_schedules_advanced.md @@ -23,7 +23,7 @@ Given the importance of learning rate and the learning rate schedule for trainin See the "Learning Rate Schedules" tutorial for a more basic overview of learning rates, and an example of how to use them while training your own models. -```python +```{.python .input} %matplotlib inline import copy import math @@ -32,7 +32,7 @@ import numpy as np import matplotlib.pyplot as plt ``` -```python +```{.python .input} def plot_schedule(schedule_fn, iterations=1500): # Iteration count starting at 1 iterations = [i+1 for i in range(iterations)] @@ -54,7 +54,7 @@ We look at "warm-up" in more detail later in the tutorial, but this could be vie One adjustment proposed by [Jeremy Howard, Sebastian Ruder (2018)](https://arxiv.org/abs/1801.06146) was to change the ratio between the increasing and decreasing stages, instead of the 50:50 split. Changing the increasing fraction (`inc_fraction!=0.5`) leads to a **"slanted triangular"** schedule. Using `inc_fraction<0.5` tends to give better results. -```python +```{.python .input} class TriangularSchedule(): def __init__(self, min_lr, max_lr, cycle_length, inc_fraction=0.5): """ @@ -82,7 +82,7 @@ class TriangularSchedule(): We look an example of a slanted triangular schedule that increases from a learning rate of 1 to 2, and back to 1 over 1000 iterations. Since we set `inc_fraction=0.2`, 200 iterations are used for the increasing stage, and 800 for the decreasing stage. After this, the schedule stays at the lower bound indefinitely. -```python +```{.python .input} schedule = TriangularSchedule(min_lr=1, max_lr=2, cycle_length=1000, inc_fraction=0.2) plot_schedule(schedule) ``` @@ -96,7 +96,7 @@ plot_schedule(schedule) Continuing with the idea that smooth decay profiles give improved performance over stepwise decay, [Ilya Loshchilov, Frank Hutter (2016)](https://arxiv.org/abs/1608.03983) used **"cosine annealing"** schedules to good effect. As with triangular schedules, the original idea was that this should be used as part of a cyclical schedule, but we begin by implementing the cosine annealing component before the full Stochastic Gradient Descent with Warm Restarts (SGDR) method later in the tutorial. -```python +```{.python .input} class CosineAnnealingSchedule(): def __init__(self, min_lr, max_lr, cycle_length): """ @@ -120,7 +120,7 @@ class CosineAnnealingSchedule(): We look at an example of a cosine annealing schedule that smoothing decreases from a learning rate of 2 to 1 across 1000 iterations. 
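The core of that pattern can be shown with a toy model (a sketch only, not the tutorial's network): query the schedule with the iteration index on every step and push the value into the trainer before calling `step`.

```{.python .input}
import mxnet as mx

sched = mx.lr_scheduler.FactorScheduler(step=5, factor=0.5)
sched.base_lr = 0.1

toy_net = mx.gluon.nn.Dense(1)
toy_net.initialize()
toy_trainer = mx.gluon.Trainer(toy_net.collect_params(), 'sgd',
                               {'learning_rate': sched.base_lr})
l2_loss = mx.gluon.loss.L2Loss()

x = mx.nd.random.uniform(shape=(8, 4))
y = mx.nd.random.uniform(shape=(8, 1))

for iteration_idx in range(1, 11):
    toy_trainer.set_learning_rate(sched(iteration_idx))  # drive the schedule manually
    with mx.autograd.record():
        loss = l2_loss(toy_net(x), y)
    loss.backward()
    toy_trainer.step(x.shape[0])
print(toy_trainer.learning_rate)
```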
After this, the schedule stays at the lower bound indefinietly. -```python +```{.python .input} schedule = CosineAnnealingSchedule(min_lr=1, max_lr=2, cycle_length=1000) plot_schedule(schedule) ``` @@ -140,7 +140,7 @@ Unlike the schedules above and those implemented in `mx.lr_scheduler`, these cla Using the idea of linear warm-up of the learning rate proposed in ["Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour" by Priya Goyal et al. (2017)](https://arxiv.org/abs/1706.02677), we implement a wrapper class that adds warm-up to an existing schedule. Going from `start_lr` to the initial learning rate of the `schedule` over `length` iterations, this adjustment is useful when training with large batch sizes. -```python +```{.python .input} class LinearWarmUp(): def __init__(self, schedule, start_lr, length): """ @@ -164,7 +164,7 @@ class LinearWarmUp(): As an example, we add a linear warm-up of the learning rate (from 0 to 1 over 250 iterations) to a stepwise decay schedule. We first create the `MultiFactorScheduler` (and set the `base_lr`) and then pass it to `LinearWarmUp` to add the warm-up at the start. We can use `LinearWarmUp` with any other schedule including `CosineAnnealingSchedule`. -```python +```{.python .input} schedule = mx.lr_scheduler.MultiFactorScheduler(step=[250, 750, 900], factor=0.5) schedule.base_lr = 1 schedule = LinearWarmUp(schedule, start_lr=0, length=250) @@ -180,7 +180,7 @@ plot_schedule(schedule) Similarly, we could add a linear cool-down period to our schedule and this is used in the "1-Cycle" schedule proposed by [Leslie N. Smith, Nicholay Topin (2017)](https://arxiv.org/abs/1708.07120) to train neural networks very quickly in certain circumstances (coined "super-convergence"). We reduce the learning rate from its value at `start_idx` of `schedule` to `finish_lr` over a period of `length`, and then maintain `finish_lr` thereafter. -```python +```{.python .input} class LinearCoolDown(): def __init__(self, schedule, finish_lr, start_idx, length): """ @@ -209,7 +209,7 @@ class LinearCoolDown(): As an example, we apply learning rate cool-down to a `MultiFactorScheduler`. Starting the cool-down at iteration 1000, we reduce the learning rate linearly from 0.125 to 0.001 over 500 iterations, and hold the learning rate at 0.001 after this. -```python +```{.python .input} schedule = mx.lr_scheduler.MultiFactorScheduler(step=[250, 750, 900], factor=0.5) schedule.base_lr = 1 schedule = LinearCoolDown(schedule, finish_lr=0.001, start_idx=1000, length=500) @@ -225,7 +225,7 @@ plot_schedule(schedule) So we can implement the "1-Cycle" schedule proposed by [Leslie N. Smith, Nicholay Topin (2017)](https://arxiv.org/abs/1708.07120) we use a single and symmetric cycle of the triangular schedule above (i.e. `inc_fraction=0.5`), followed by a cool-down period of `cooldown_length` iterations. -```python +```{.python .input} class OneCycleSchedule(): def __init__(self, start_lr, max_lr, cycle_length, cooldown_length=0, finish_lr=None): """ @@ -251,7 +251,7 @@ class OneCycleSchedule(): As an example, we linearly increase and then decrease the learning rate from 0.1 to 0.5 and back over 500 iterations (i.e. single triangular cycle), before reducing the learning rate further to 0.001 over the next 750 iterations (i.e. cool-down). -```python +```{.python .input} schedule = OneCycleSchedule(start_lr=0.1, max_lr=0.5, cycle_length=500, cooldown_length=750, finish_lr=0.001) plot_schedule(schedule) ``` @@ -265,7 +265,7 @@ plot_schedule(schedule) Originally proposed by [Leslie N. 
Smith (2015)](https://arxiv.org/abs/1506.01186), the idea of cyclically increasing and decreasing the learning rate has been shown to give faster convergence and more optimal solutions. We implement a wrapper class that loops existing cycle-based schedules such as `TriangularSchedule` and `CosineAnnealingSchedule` to provide infinitely repeating schedules. We pass the schedule class (rather than an instance) because one feature of the `CyclicalSchedule` is to vary the `cycle_length` over time as seen in [Ilya Loshchilov, Frank Hutter (2016)](https://arxiv.org/abs/1608.03983) using `cycle_length_decay`. Another feature is the ability to decay the cycle magnitude over time with `cycle_magnitude_decay`. -```python +```{.python .input} class CyclicalSchedule(): def __init__(self, schedule_class, cycle_length, cycle_length_decay=1, cycle_magnitude_decay=1, **kwargs): """ @@ -298,7 +298,7 @@ class CyclicalSchedule(): As an example, we implement the triangular cyclical schedule presented in ["Cyclical Learning Rates for Training Neural Networks" by Leslie N. Smith (2015)](https://arxiv.org/abs/1506.01186). We use slightly different terminology to the paper here because we use `cycle_length` that is twice the 'stepsize' used in the paper. We repeat cycles, each with a length of 500 iterations and lower and upper learning rate bounds of 0.5 and 2 respectively. -```python +```{.python .input} schedule = CyclicalSchedule(TriangularSchedule, min_lr=0.5, max_lr=2, cycle_length=500) plot_schedule(schedule) ``` @@ -310,7 +310,7 @@ plot_schedule(schedule) And lastly, we implement the scheduled used in ["SGDR: Stochastic Gradient Descent with Warm Restarts" by Ilya Loshchilov, Frank Hutter (2016)](https://arxiv.org/abs/1608.03983). We repeat cosine annealing schedules, but each time we halve the magnitude and double the cycle length. -```python +```{.python .input} schedule = CyclicalSchedule(CosineAnnealingSchedule, min_lr=0.01, max_lr=2, cycle_length=250, cycle_length_decay=2, cycle_magnitude_decay=0.5) plot_schedule(schedule) diff --git a/docs/python_docs/python/tutorials/packages/gluon/training/normalization/index.md b/docs/python_docs/python/tutorials/packages/gluon/training/normalization/index.md index 962834909a97..c17abe101613 100644 --- a/docs/python_docs/python/tutorials/packages/gluon/training/normalization/index.md +++ b/docs/python_docs/python/tutorials/packages/gluon/training/normalization/index.md @@ -41,7 +41,7 @@ Warning: You should calculate the normalization means and standard deviations us When using pre-trained models from the [Gluon Model Zoo](https://mxnet.apache.org/api/python/gluon/model_zoo.html) you'll usually see the normalization statistics used for training (i.e. statistics from step 1). You'll want to use these statistics to normalize your own input data for fine-tuning or inference with these models. Using `transforms.Normalize` is one way of applying the normalization, and this should be used in the `Dataset`. -```python +```{.python .input} import mxnet as mx from mxnet.gluon.data.vision.transforms import Normalize @@ -81,7 +81,7 @@ Warning: it seems that `BatchNorm` is better suited to convolutional networks (C As an example, we'll apply `BatchNorm` to a batch of 2 samples, each with 2 channels, and both height and width of 2 (in NCHW format). -```python +```{.python .input} data = mx.nd.arange(start=0, stop=2*2*2*2).reshape(2, 2, 2, 2) print(data) ``` @@ -89,14 +89,14 @@ print(data) With MXNet Gluon we can apply batch normalization with the `mx.gluon.nn.BatchNorm` block. 
It can be created and used just like any other MXNet Gluon block (such as `Conv2D`). Its input will typically be unnormalized activations from the previous layer, and the output will be the normalized activations ready for the next layer. Since we're using data in NCHW format we can use the default axis. -```python +```{.python .input} net = mx.gluon.nn.BatchNorm() ``` We still need to initialize the block because it has a number of trainable parameters, as we'll see later on. -```python +```{.python .input} net.initialize() ``` @@ -107,7 +107,7 @@ Remember: `BatchNorm` runs differently during training and inference. When train Warning: `BatchNorm` assumes the channel dimension is the 2nd in order (i.e. `axis=1`). You need to ensure your data has a channel dimension, and change the `axis` parameter of `BatchNorm` if it's not the 2nd dimension. A batch of greyscale images of shape `(100,32,32)` would not work, since the 2nd dimension is height and not channel. You'd need to add a channel dimension using `data.expand_dims(1)` in this case to give shape `(100,1,32,32)`. -```python +```{.python .input} with mx.autograd.record(): output = net(data) loss = output.abs() @@ -118,7 +118,7 @@ print(output) We can immediately see the activations have been scaled down and centered around zero. Activations are the same for each channel, because each channel was normalized independently. We can do a quick sanity check on these results, by manually calculating the batch mean and variance for each channel. -```python +```{.python .input} batch_means = data.mean(axis=1, exclude=True) batch_vars = (data - batch_means.reshape(1, -1, 1, 1)).square().mean(axis=1, exclude=True) print('batch_means:', batch_means.asnumpy()) @@ -128,7 +128,7 @@ print('batch_vars:', batch_vars.asnumpy()) And use these to scale the first entry in `data`, to confirm the `BatchNorm` calculation of `-1.324` was correct. -```python +```{.python .input} print("manually calculated:", ((data[0][0][0][0] - batch_means[0])/batch_vars[0].sqrt()).asnumpy()) print("automatically calculated:", output[0][0][0][0].asnumpy()) ``` @@ -141,7 +141,7 @@ Advanced: when using a pre-trained model inside another model (e.g. a pre-traine After a single step (specifically after the `backward` call) we can see the `running_mean` and `running_var` have been updated. -```python +```{.python .input} print('running_mean:', net.running_mean.data().asnumpy()) print('running_var:', net.running_var.data().asnumpy()) ``` @@ -149,7 +149,7 @@ print('running_var:', net.running_var.data().asnumpy()) You should notice though that these running statistics do not match the batch statistics we just calculated. And instead they are just 10% of the value we'd expect. We see this because of the exponential average process, and because the `momentum` parameter of `BatchNorm` is equal to 0.9 : i.e. 10% of the new value, 90% of the old value (which was initialized to 0). Over time the running statistics will converge to the statistics of the input distribution, while still being flexible enough to adjust to shifts in the input distribution. Using the same batch another 100 times (which wouldn't happen in practice), we can see the running statistics converge to the batch statsitics calculated before. -```python +```{.python .input} for i in range(100): with mx.autograd.record(): output = net(data) @@ -168,7 +168,7 @@ Advanced: Sometimes used for input normalization, you can prevent `beta` shiftin We haven't updated these parameters yet, so they should still be as initialized. 
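The convergence behaviour itself can be reproduced with plain Python, using the exponential-moving-average update described above (momentum 0.9) and the first channel's batch mean from this example (5.5):

```{.python .input}
running_mean, momentum, batch_mean = 0.0, 0.9, 5.5
for _ in range(100):
    running_mean = momentum * running_mean + (1 - momentum) * batch_mean
print(running_mean)   # effectively converged to the batch statistic
```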
You can see the default for `beta` is 0 (i.e. not shift) and `gamma` is 1 (i.e. not scale), so the initial behaviour is to keep the distribution unit normalized. -```python +```{.python .input} print('beta:', net.beta.data().asnumpy()) print('gamma:', net.gamma.data().asnumpy()) ``` @@ -176,7 +176,7 @@ print('gamma:', net.gamma.data().asnumpy()) We can also check the gradient on these parameters. Since we were finding the gradient of the sum of absolute values, we would expect the gradient of `gamma` to be equal to the number of points in the data (i.e. 16). So to minimize the loss we'd decrease the value of `gamma`, which would happen as part of a `trainer.step`. -```python +```{.python .input} print('beta gradient:', net.beta.grad().asnumpy()) print('gamma gradient:', net.gamma.grad().asnumpy()) ``` @@ -186,7 +186,7 @@ print('gamma gradient:', net.gamma.grad().asnumpy()) When it comes to inference, `BatchNorm` uses the global statistics that were calculated during training. Since we're using the same batch of data over and over again (and our global running statistics have converged), we get a very similar result to using training mode. `beta` and `gamma` are also applied by default (unless explicitly removed). -```python +```{.python .input} output = net(data) print(output) ``` @@ -211,7 +211,7 @@ Figure 3: `LayerNorm` on NCHW data | Figure 4: `LayerNorm` on NTC data As an example, we'll apply `LayerNorm` to a batch of 2 samples, each with 4 time steps and 2 channels (in NTC format). -```python +```{.python .input} data = mx.nd.arange(start=0, stop=2*4*2).reshape(2, 4, 2) print(data) ``` @@ -219,7 +219,7 @@ print(data) With MXNet Gluon we can apply layer normalization with the `mx.gluon.nn.LayerNorm` block. We need to call `initialize` because `LayerNorm` has two learnable parameters by default: `beta` and `gamma` that are used for post normalization shifting and scaling of each channel. -```python +```{.python .input} net = mx.gluon.nn.LayerNorm() net.initialize() output = net(data) @@ -231,7 +231,7 @@ We can see that normalization has been applied across all channels for each time We can also check the parameters `beta` and `gamma` and see that they are per channel (i.e. 2 of each in this example). -```python +```{.python .input} print('beta:', net.beta.data().asnumpy()) print('gamma:', net.gamma.data().asnumpy()) ``` @@ -250,7 +250,7 @@ Figure 3: `InstanceNorm` on NCHW data | Figure 4: `InstanceNorm` on NTC data As an example, we'll apply `InstanceNorm` to a batch of 2 samples, each with 2 channels, and both height and width of 2 (in NCHW format). -```python +```{.python .input} data = mx.nd.arange(start=0, stop=2*2*2*2).reshape(2, 2, 2, 2) print(data) ``` @@ -258,7 +258,7 @@ print(data) With MXNet Gluon we can apply instance normalization with the `mx.gluon.nn.InstanceNorm` block. We need to call `initialize` because InstanceNorm has two learnable parameters by default: `beta` and `gamma` that are used for post normalization shifting and scaling of each channel. -```python +```{.python .input} net = mx.gluon.nn.InstanceNorm() net.initialize() output = net(data) @@ -268,7 +268,7 @@ print(output) We can also check the parameters `beta` and `gamma` and see that they are per channel (i.e. 2 of each in this example). 
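Before inspecting those parameters, the output printed above can be cross-checked by hand, since `InstanceNorm` normalizes each channel of each sample independently (a sketch that assumes the default epsilon of 1e-5):

```{.python .input}
import mxnet as mx

data = mx.nd.arange(start=0, stop=2*2*2*2).reshape(2, 2, 2, 2)
sample_channel = data[0][0]                     # first sample, first channel
mean = sample_channel.mean()
var = ((sample_channel - mean) ** 2).mean()
print((sample_channel - mean) / (var + 1e-5).sqrt())
```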
-```python +```{.python .input} print('beta:', net.beta.data().asnumpy()) print('gamma:', net.gamma.data().asnumpy()) ``` diff --git a/docs/python_docs/python/tutorials/packages/kvstore/kvstore.md b/docs/python_docs/python/tutorials/packages/kvstore/kvstore.md index c03a03d1080e..81502901aafd 100644 --- a/docs/python_docs/python/tutorials/packages/kvstore/kvstore.md +++ b/docs/python_docs/python/tutorials/packages/kvstore/kvstore.md @@ -26,7 +26,7 @@ and pull data out. Let's consider a simple example: initializing a (`int`, `NDArray`) pair into the store, and then pulling the value out: -```python +```{.python .input} import mxnet as mx kv = mx.kv.create('local') # create a local kv store. @@ -43,7 +43,7 @@ print(a.asnumpy()) For any key that has been initialized, you can push a new value with the same shape to the key: -```python +```{.python .input} kv.push(3, mx.nd.ones(shape)*8) kv.pull(3, out = a) # pull out the value print(a.asnumpy()) @@ -56,7 +56,7 @@ values into the same key, where KVStore will first sum all of these values and then push the aggregated value. Here we will just demonstrate pushing a list of values on CPU. Please note summation only happens if the value list is longer than one -```python +```{.python .input} contexts = [mx.cpu(i) for i in range(4)] b = [mx.nd.ones(shape, ctx) for ctx in contexts] kv.push(3, b) @@ -70,7 +70,7 @@ For each push, KVStore combines the pushed value with the value stored using an `updater`. The default updater is `ASSIGN`. You can replace the default to control how data is merged: -```python +```{.python .input} def update(key, input, stored): print("update on key: %d" % key) stored += input * 2 @@ -81,7 +81,7 @@ print(a.asnumpy()) `[[ 4. 4. 4.],[ 4. 4. 4.]]` -```python +```{.python .input} kv.push(3, mx.nd.ones(shape)) kv.pull(3, out=a) print(a.asnumpy()) @@ -97,7 +97,7 @@ print(a.asnumpy()) You've already seen how to pull a single key-value pair. Similarly, to push, you can pull the value onto several devices with a single call: -```python +```{.python .input} b = [mx.nd.ones(shape, ctx) for ctx in contexts] kv.pull(3, out = b) print(b[1].asnumpy()) @@ -112,7 +112,7 @@ an interface for a list of key-value pairs. For a single device: -```python +```{.python .input} keys = [5, 7, 9] kv.init(keys, [mx.nd.ones(shape)]*len(keys)) kv.push(keys, [mx.nd.ones(shape)]*len(keys)) @@ -131,7 +131,7 @@ print(b[1].asnumpy()) For multiple devices: -```python +```{.python .input} b = [[mx.nd.ones(shape, ctx) for ctx in contexts]] * len(keys) kv.push(keys, b) kv.pull(keys, out = b) diff --git a/docs/python_docs/python/tutorials/packages/legacy/ndarray/01-ndarray-intro.md b/docs/python_docs/python/tutorials/packages/legacy/ndarray/01-ndarray-intro.md index 6bc373e356e8..750f66ae438e 100644 --- a/docs/python_docs/python/tutorials/packages/legacy/ndarray/01-ndarray-intro.md +++ b/docs/python_docs/python/tutorials/packages/legacy/ndarray/01-ndarray-intro.md @@ -43,21 +43,21 @@ To get started, let's import habit of setting a random seed so that you always get the same results that we do. -```python +```{.python .input} import mxnet as mx from mxnet import nd ``` Let's start with a very simple 1-dimensional array with a python list. -```python +```{.python .input} x = nd.array([1,2,3]) print(x) ``` Now a 2-dimensional array. -```python +```{.python .input} y = nd.array([[1,2,3,4], [1,2,3,4], [1,2,3,4]]) print(y) ``` @@ -67,7 +67,7 @@ Specifically, we'll create a 2D array (also called a *matrix*) with 3 rows and 4 columns using the `.empty` function. 
We'll also try out `.full` which takes an additional parameter for what value you want to fill in the array. -```python +```{.python .input} x = nd.empty((3, 3)) print(x) x = nd.full((3,3), 7) @@ -85,7 +85,7 @@ different here (3,10) since the zeros may not produce anything different from empty... or use the two demonstrations to show something interesting or unique... when would I use one over the other?--> -```python +```{.python .input} x = nd.zeros((3, 10)) print(x) ``` @@ -93,7 +93,7 @@ print(x) Similarly, `ndarray` has a function to create a matrix of all ones aptly named [ones](/api/python/docs/api/ndarray/ndarray.html#mxnet.ndarray.ones). -```python +```{.python .input} x = nd.ones((3, 4)) print(x) ``` @@ -109,7 +109,7 @@ Is it that important to introduce zero mean and unit variance right now? Describe more? Or how about explain which is which for the 0 and the 1 and what they're going to do... if it actually matters at this point. --> -```python +```{.python .input} y = nd.random_normal(0, 1, shape=(3, 4)) print(y) ``` @@ -117,7 +117,7 @@ print(y) Sometimes you will want to copy an array by its shape but not its contents. You can do this with `.zeros_like`. -```python +```{.python .input} z = nd.zeros_like(y) print(z) ``` @@ -125,7 +125,7 @@ print(z) As in NumPy, the dimensions of each `NDArray` are accessible via the `.shape` attribute. -```python +```{.python .input} y.shape ``` @@ -135,13 +135,13 @@ how much memory the array occupies. -```python +```{.python .input} y.size ``` We can query the data type using `.dtype`. -```python +```{.python .input} y.dtype ``` @@ -150,7 +150,7 @@ precision, or you might want to use a different data type. You can force the data type when you create the array using a numpy type. This requires you to import numpy first. -```python +```{.python .input} import numpy as np a = nd.array([1,2,3]) b = nd.array([1,2,3], dtype=np.int32) @@ -163,7 +163,7 @@ happen on specific devices that you can set. You can compute on CPU(s), GPU(s), specific GPU, or all of the above depending on your situation and preference. Using `.context` reveals the location of the variable. -```python +```{.python .input} y.context ``` diff --git a/docs/python_docs/python/tutorials/packages/legacy/ndarray/02-ndarray-operations.md b/docs/python_docs/python/tutorials/packages/legacy/ndarray/02-ndarray-operations.md index 352f6b7a0f34..c2270f7953ef 100644 --- a/docs/python_docs/python/tutorials/packages/legacy/ndarray/02-ndarray-operations.md +++ b/docs/python_docs/python/tutorials/packages/legacy/ndarray/02-ndarray-operations.md @@ -35,12 +35,12 @@ Such as element-wise addition: -```python +```{.python .input} import mxnet as mx from mxnet import nd ``` -```python +```{.python .input} x = nd.ones((3, 4)) y = nd.random_normal(0, 1, shape=(3, 4)) print('x=', x) @@ -51,7 +51,7 @@ print('x = x + y, x=', x) Multiplication: -```python +```{.python .input} x = nd.array([1, 2, 3]) y = nd.array([2, 2, 2]) x * y @@ -61,7 +61,7 @@ And exponentiation: -```python +```{.python .input} nd.exp(x) ``` @@ -69,7 +69,7 @@ We can also grab a matrix's transpose to compute a proper matrix-matrix product. -```python +```{.python .input} nd.dot(x, y.T) ``` @@ -93,7 +93,7 @@ detail, and quite possibily in its own notebook since I think it will help to show some gotchas like you mentioned verbally. 
I am still leaning toward delaying the introduction of this topic....--> -```python +```{.python .input} print('y=', y) print('id(y):', id(y)) y = y + x @@ -104,7 +104,7 @@ print('id(y):', id(y)) We can assign the result to a previously allocated array with slice notation, e.g., `result[:] = ...`. -```python +```{.python .input} print('x=', x) z = nd.zeros_like(x) print('z is zeros_like x, z=', z) @@ -120,7 +120,7 @@ before copying it to z. To make better use of memory, we can perform operations in place, avoiding temporary buffers. To do this we specify the `out` keyword argument every operator supports: -```python +```{.python .input} print('x=', x, 'is in id(x):', id(x)) print('y=', y, 'is in id(y):', id(y)) print('z=', z, 'is in id(z):', id(z)) @@ -136,7 +136,7 @@ itself. There are two ways to do this in MXNet. = x op y 2. By using the op-equals operators like `+=` -```python +```{.python .input} print('x=', x, 'is in id(x):', id(x)) x += y print('x=', x, 'is in id(x):', id(x)) @@ -155,7 +155,7 @@ the whole array: a[:] Here's an example of reading the second and third rows from `x`. -```python +```{.python .input} x = nd.array([1, 2, 3]) print('1D complete array, x=', x) s = x[1:3] @@ -168,7 +168,7 @@ print('slicing the 2nd and 3rd elements, s=', s) Now let's try writing to a specific element. -```python +```{.python .input} print('original x, x=', x) x[2] = 9.0 print('replaced entire row with x[2] = 9.0, x=', x) @@ -180,7 +180,7 @@ print('replaced range of elements with x[1:2,1:3] = 5.0, x=', x) Multi-dimensional slicing is also supported. -```python +```{.python .input} x = nd.array([[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]]) print('original x, x=', x) s = x[1:2,1:3] @@ -214,7 +214,7 @@ a shape like (3,3) you lose some of the impact and miss some errors if people play with the values. Better to have a distinct shape so that it is more obvious what is happening and what can break.--> -```python +```{.python .input} x = nd.ones(shape=(3,6)) print('x = ', x) y = nd.arange(6) @@ -230,7 +230,7 @@ That's because broadcasting prefers to duplicate along the left most axis. We can alter this behavior by explicitly giving `y` a $2$D shape using `.reshape`. You can also chain `.arange` and `.reshape` to do this in one step. -```python +```{.python .input} y = y.reshape((3,1)) print('y = ', y) print('x + y = ', x+y) @@ -242,12 +242,12 @@ print('y = ', y) Converting MXNet NDArrays to and from NumPy is easy. The converted arrays do not share memory. -```python +```{.python .input} a = x.asnumpy() type(a) ``` -```python +```{.python .input} y = nd.array(a) print('id(a)=', id(a), 'id(x)=', id(x), 'id(y)=', id(y)) ``` diff --git a/docs/python_docs/python/tutorials/packages/legacy/ndarray/gotchas_numpy_in_mxnet.md b/docs/python_docs/python/tutorials/packages/legacy/ndarray/gotchas_numpy_in_mxnet.md index 98d79d7510e9..2518f52e78bb 100644 --- a/docs/python_docs/python/tutorials/packages/legacy/ndarray/gotchas_numpy_in_mxnet.md +++ b/docs/python_docs/python/tutorials/packages/legacy/ndarray/gotchas_numpy_in_mxnet.md @@ -55,7 +55,7 @@ If a required operator is missing from `NDArray API`, there are few things you c There are a situation, when you can assemble a higher level operator using existing operators. An example for that is the [np.full_like()](https://docs.scipy.org/doc/numpy-1.14.0/reference/generated/numpy.full_like.html) operator. This operator doesn't exist in `NDArray API`, but can be easily replaced with a combination of existing operators. 
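One possible composition is sketched here (the tutorial's own version is in the next cell):

```{.python .input}
import numpy as np
from mxnet import nd

a = nd.array([[1, 2, 3], [4, 5, 6]])
b = nd.ones_like(a) * 7                          # emulate np.full_like(a, 7)
print(np.array_equal(b.asnumpy(), np.full_like(a.asnumpy(), 7)))
```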
-```python +```{.python .input} from mxnet import nd import numpy as np @@ -80,7 +80,7 @@ Some operators may have slightly different name, but are similar in terms of fun One particular example of different input requirements is [nd.pad()](/api/python/docs/api/ndarray/ndarray.html#mxnet.ndarray.pad). The trick is that it can only work with 4-dimensional tensors. If your input has less dimensions, then you need to expand its number before using `nd.pad()` as it is shown in the code block below: -```python +```{.python .input} def pad_array(data, max_length): # expand dimensions to 4, because nd.pad can work only with 4 dims data_expanded = data.reshape(1, 1, 1, data.shape[0]) @@ -115,7 +115,7 @@ There are cases, when you have to use either `.asnumpy()` or `.asscalar()` metho You can minimize the impact of a blocking call by calling `.asnumpy()` or `.asscalar()` in the moment, when you think the calculation of this value is already done. In the example below, we introduce the `LossBuffer` class. It is used to cache the previous value of a loss function. By doing so, we delay printing by one iteration in hope that the `Execution Engine` would finish the previous iteration and blocking time would be minimized. -```python +```{.python .input} from __future__ import print_function import mxnet as mx diff --git a/docs/python_docs/python/tutorials/packages/legacy/ndarray/sparse/csr.md b/docs/python_docs/python/tutorials/packages/legacy/ndarray/sparse/csr.md index b91279cff4d4..b14bec66d861 100644 --- a/docs/python_docs/python/tutorials/packages/legacy/ndarray/sparse/csr.md +++ b/docs/python_docs/python/tutorials/packages/legacy/ndarray/sparse/csr.md @@ -94,7 +94,7 @@ There are a few different ways to create a `CSRNDArray`, but first let's recreat You can create a CSRNDArray with data, indices and indptr by using the `csr_matrix` function: -```python +```{.python .input} import mxnet as mx # Create a CSRNDArray with python lists shape = (3, 4) @@ -116,7 +116,7 @@ array([[ 7., 0., 8., 0.], -```python +```{.python .input} import numpy as np # Create a CSRNDArray with numpy arrays data_np = np.array([7, 8, 9]) @@ -136,7 +136,7 @@ array([[7, 0, 8, 0], -```python +```{.python .input} # Compare the two. They are exactly the same. {'a':a.asnumpy(), 'b':b.asnumpy()} ``` @@ -155,7 +155,7 @@ array([[7, 0, 8, 0], You can create an MXNet CSRNDArray from a `scipy.sparse.csr.csr_matrix` object by using the `array` function: -```python +```{.python .input} try: import scipy.sparse as spsp # generate a csr matrix in scipy @@ -177,7 +177,7 @@ d:[[7 0 8 0] What if you have a big set of data and you haven't calculated indices or indptr yet? Let's try a simple CSRNDArray from an existing array of data and derive those values with some built-in functions. We can mockup a "big" dataset with a random amount of the data being non-zero, then compress it by using the `tostype` function, which is explained further in the [Storage Type Conversion](#storage-type-conversion) section: -```python +```{.python .input} big_array = mx.nd.round(mx.nd.random.uniform(low=0, high=1, shape=(1000, 100))) print(big_array) big_array_csr = big_array.tostype('csr') @@ -205,7 +205,7 @@ You can also create a CSRNDArray from another using the `array` function specify which accepts a numpy type. By default, `float32` is used. 
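As a quick sanity check of that default, the small sketch below reuses the `data`, `indices` and `indptr` values from the first example in this tutorial; the variable name `default_csr` is chosen purely for illustration.

```{.python .input}
import mxnet as mx

# Build a CSRNDArray without passing an explicit dtype and inspect the result;
# the element type should come back as float32.
default_csr = mx.nd.sparse.csr_matrix(([7, 8, 9], [0, 2, 1], [0, 2, 2, 3]), shape=(3, 4))
print(default_csr.dtype)
```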
-```python +```{.python .input} # Float32 is used by default e = mx.nd.sparse.array(a) # Create a 16-bit float array @@ -233,7 +233,7 @@ As you have seen already, we can inspect the contents of a `CSRNDArray` by filli its contents into a dense `numpy.ndarray` using the `asnumpy` function. -```python +```{.python .input} a.asnumpy() ``` @@ -249,7 +249,7 @@ array([[ 7., 0., 8., 0.], You can also inspect the internal storage of a CSRNDArray by accessing attributes such as `indptr`, `indices` and `data`: -```python +```{.python .input} # Access data array data = a.data # Access indices array @@ -281,7 +281,7 @@ You can also convert storage types with: To convert an NDArray to a CSRNDArray and vice versa by using the ``tostype`` function: -```python +```{.python .input} # Create a dense NDArray ones = mx.nd.ones((2,2)) # Cast the storage type from `default` to `csr` @@ -305,7 +305,7 @@ dense = csr.tostype('default') To convert the storage type by using the `cast_storage` operator: -```python +```{.python .input} # Create a dense NDArray ones = mx.nd.ones((2,2)) # Cast the storage type to `csr` @@ -332,7 +332,7 @@ You can use the `copy` method which makes a deep copy of the array and its data, You can also use the `copyto` method or the slice operator `[]` to deep copy to an existing array. -```python +```{.python .input} a = mx.nd.ones((2,2)).tostype('csr') b = a.copy() c = mx.nd.sparse.zeros('csr', (2,2)) @@ -357,7 +357,7 @@ the storage type of destination array will not change when copying with `copyto` the slice operator `[]`. -```python +```{.python .input} e = mx.nd.sparse.zeros('csr', (2,2)) f = mx.nd.sparse.zeros('csr', (2,2)) g = mx.nd.ones(e.shape) @@ -377,7 +377,7 @@ g.copyto(f) You can slice a CSRNDArray on axis 0 with operator `[]`, which copies the slices and returns a new CSRNDArray. -```python +```{.python .input} a = mx.nd.array(np.arange(6).reshape(3,2)).tostype('csr') b = a[1:2].asnumpy() c = a[:].asnumpy() @@ -403,7 +403,7 @@ Note that multi-dimensional indexing or slicing along a particular axis is curre Operators that have specialized implementation for sparse arrays can be accessed in `mx.nd.sparse`. You can read the [mxnet.ndarray.sparse API documentation](https://mxnet.apache.org/versions/master/api/python/ndarray/sparse.html) to find what sparse operators are available. -```python +```{.python .input} shape = (3, 4) data = [7, 8, 9] indptr = [0, 2, 2, 3] @@ -428,7 +428,7 @@ out = mx.nd.sparse.dot(a, rhs) # invoke sparse dot operator specialized for dot For any sparse operator, the storage type of output array is inferred based on inputs. You can either read the documentation or inspect the `stype` attribute of the output array to know what storage type is inferred: -```python +```{.python .input} b = a * 2 # b will be a CSRNDArray since zero multiplied by 2 is still zero c = a + mx.nd.ones(shape=(3, 4)) # c will be a dense NDArray {'b.stype':b.stype, 'c.stype':c.stype} @@ -448,7 +448,7 @@ If sparse inputs are provided, MXNet will convert sparse inputs into dense ones If sparse outputs are provided, MXNet will convert the dense outputs generated by the dense operator into the provided sparse format. 
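If you prefer to make those conversions explicit in your own code rather than rely on the implicit fallback, one possible pattern, sketched below with a small illustrative matrix and variable names chosen only for this example, is to convert once with `tostype`, run the dense-only computation, and convert the result back.

```{.python .input}
import mxnet as mx

# Make the sparse -> dense -> sparse round trip explicit instead of relying
# on the implicit storage fallback described above.
sparse_in = mx.nd.sparse.csr_matrix(([7., 8., 9.], [0, 2, 1], [0, 2, 2, 3]), shape=(3, 4))
dense_in = sparse_in.tostype('default')   # convert once to the dense format
dense_out = mx.nd.log(dense_in + 1)       # dense-only computation; log(0 + 1) = 0, so zeros stay zero
sparse_out = dense_out.tostype('csr')     # convert the result back to CSR
print(sparse_out.stype)
```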
-```python +```{.python .input} e = mx.nd.sparse.zeros('csr', a.shape) d = mx.nd.log(a) # dense operator with a sparse input e = mx.nd.log(a, out=e) # dense operator with a sparse output @@ -469,7 +469,7 @@ Note that warning messages will be printed when such a storage fallback event ha You can load data in batches from a CSRNDArray using `mx.io.NDArrayIter`: -```python +```{.python .input} # Create the source CSRNDArray data = mx.nd.array(np.arange(36).reshape((9,4))).tostype('csr') labels = np.ones([9, 1]) @@ -492,7 +492,7 @@ dataiter = mx.io.NDArrayIter(data, labels, batch_size, last_batch_handle='discar You can also load data stored in the [libsvm file format](https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/) using `mx.io.LibSVMIter`, where the format is: ``