diff --git a/docs/tutorials/gluon/hybrid.md b/docs/tutorials/gluon/hybrid.md
index b7725386336e..01296722e4f3 100644
--- a/docs/tutorials/gluon/hybrid.md
+++ b/docs/tutorials/gluon/hybrid.md
@@ -154,7 +154,11 @@ You can use other language bindings to load them. You can also load them back
 to gluon with `SymbolBlock`:
 
 ```python
-net2 = gluon.SymbolBlock.imports('model-symbol.json', ['data'], 'model-0001.params')
+import warnings
+
+with warnings.catch_warnings():
+    warnings.simplefilter("ignore")
+    net2 = gluon.SymbolBlock.imports('model-symbol.json', ['data'], 'model-0001.params')
 ```
 
 ## Operators that do not work with hybridize
@@ -259,4 +263,4 @@ For example, avoid writing `x += y` and use `x = x + y`, otherwise you will get
 The recommended practice is to utilize the flexibility of imperative NDArray API
 during experimentation. Once you finalized your model, make necessary changes
 mentioned above so you can call `hybridize` function to improve performance.
-
\ No newline at end of file
+
diff --git a/docs/tutorials/gluon/save_load_params.md b/docs/tutorials/gluon/save_load_params.md
index ffebefdf80e1..61dad4263531 100644
--- a/docs/tutorials/gluon/save_load_params.md
+++ b/docs/tutorials/gluon/save_load_params.md
@@ -260,7 +260,10 @@ One of the main reasons to serialize model architecture into a JSON file is to l
 Serialized Hybrid networks (saved as .JSON and .params file) can be loaded and used inside Python frontend using `gluon.nn.SymbolBlock`. To demonstrate that, let's load the network we serialized above.
 
 ```python
-deserialized_net = gluon.nn.SymbolBlock.imports("lenet-symbol.json", ['data'], "lenet-0001.params", ctx=ctx)
+import warnings
+with warnings.catch_warnings():
+    warnings.simplefilter("ignore")
+    deserialized_net = gluon.nn.SymbolBlock.imports("lenet-symbol.json", ['data'], "lenet-0001.params", ctx=ctx)
 ```
 
 `deserialized_net` now contains the network we deserialized from files. Let's test the deserialized network to make sure it works.
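The idiom the hunks above introduce is plain standard-library `warnings` machinery, and the two pieces matter together: `catch_warnings()` only snapshots the current warning filters and restores them on exit, while `simplefilter("ignore")` inside the block is what actually silences the warning that `SymbolBlock.imports` emits. A minimal self-contained sketch of the same pattern (the file names are the ones produced by the hybrid tutorial's export step above):

```python
import warnings

from mxnet import gluon

# catch_warnings() saves the active warning filters and restores them on
# exit; by itself it suppresses nothing.
with warnings.catch_warnings():
    # This call is what actually turns warnings off inside the block.
    warnings.simplefilter("ignore")
    net2 = gluon.SymbolBlock.imports(
        'model-symbol.json', ['data'], 'model-0001.params')

# Outside the block the previous filters are back in effect, so unrelated
# warnings elsewhere in the tutorial are still shown.
```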
diff --git a/docs/tutorials/index.md b/docs/tutorials/index.md
index 7e0ffaa3f72a..67f0bf0b1257 100644
--- a/docs/tutorials/index.md
+++ b/docs/tutorials/index.md
@@ -134,7 +134,6 @@ Select API:
 * [MNIST Handwritten Digit Classification](/tutorials/python/mnist.html)
 * [Movie Review Classification using Convolutional Networks](/tutorials/nlp/cnn.html)
 * [Generative Adversarial Networks (GANs)](/tutorials/unsupervised_learning/gan.html)
-* [Recommender Systems using Matrix Factorization](/tutorials/python/matrix_factorization.html)
 * [Speech Recognition with Connectionist Temporal Classification Loss](/tutorials/speech_recognition/ctc.html)
 * Practitioner Guides
     * [Predicting on new images using a pre-trained ImageNet model](/tutorials/python/predict_image.html)
diff --git a/docs/tutorials/onnx/fine_tuning_gluon.md b/docs/tutorials/onnx/fine_tuning_gluon.md
index dd0c0e93e862..1271dfc02f89 100644
--- a/docs/tutorials/onnx/fine_tuning_gluon.md
+++ b/docs/tutorials/onnx/fine_tuning_gluon.md
@@ -279,7 +279,10 @@ We create a symbol block that is going to hold all our pre-trained layers, and a
 
 
 ```python
-pre_trained = gluon.nn.SymbolBlock(outputs=new_sym, inputs=mx.sym.var('data_0'))
+import warnings
+with warnings.catch_warnings():
+    warnings.simplefilter("ignore")
+    pre_trained = gluon.nn.SymbolBlock(outputs=new_sym, inputs=mx.sym.var('data_0'))
 net_params = pre_trained.collect_params()
 for param in new_arg_params:
     if param in net_params:
diff --git a/docs/tutorials/onnx/inference_on_onnx_model.md b/docs/tutorials/onnx/inference_on_onnx_model.md
index f12e050fcc73..654d0c11bcba 100644
--- a/docs/tutorials/onnx/inference_on_onnx_model.md
+++ b/docs/tutorials/onnx/inference_on_onnx_model.md
@@ -144,7 +144,10 @@ print(data_names)
 And load them into a MXNet Gluon symbol block.
 
 ```python
-net = gluon.nn.SymbolBlock(outputs=sym, inputs=mx.sym.var('data_0'))
+import warnings
+with warnings.catch_warnings():
+    warnings.simplefilter("ignore")
+    net = gluon.nn.SymbolBlock(outputs=sym, inputs=mx.sym.var('data_0'))
 net_params = net.collect_params()
 for param in arg_params:
     if param in net_params:
@@ -247,6 +250,7 @@ Lucky for us, the [Caltech101 dataset](http://www.vision.caltech.edu/Image_Datas
 
 We show that in our next tutorial:
 
+
 - [Fine-tuning an ONNX Model using the modern imperative MXNet/Gluon](http://mxnet.incubator.apache.org/tutorials/onnx/fine_tuning_gluon.html)
 
 
diff --git a/docs/tutorials/python/matrix_factorization.md b/docs/tutorials/python/matrix_factorization.md
deleted file mode 100644
index cfe73a4856e0..000000000000
--- a/docs/tutorials/python/matrix_factorization.md
+++ /dev/null
@@ -1,289 +0,0 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-# Matrix Factorization
-
-In a recommendation system, there is a group of users and a set of items. Given
-that each users have rated some items in the system, we would like to predict
-how the users would rate the items that they have not yet rated, such that we
-can make recommendations to the users.
-
-Matrix factorization is one of the main algorithms used in recommendation
-systems. It can be used to discover latent features underlying the interactions
-between two different kinds of entities.
-
-Assume we assign a k-dimensional vector to each user and a k-dimensional vector
-to each item such that the dot product of these two vectors gives the user's
-rating of that item. We can learn the user and item vectors directly, which is
-essentially performing SVD on the user-item matrix. We can also try to learn the
-latent features using multi-layer neural networks.
-
-In this tutorial, we will work though the steps to implement these ideas in
-MXNet.
-
-```python
-# Set the logging level
-import logging
-head = '%(asctime)-15s %(message)s'
-logging.basicConfig(level=logging.INFO)
-```
-
-```python
-import mxnet as mx
-import random
-
-# Fix the random seeds
-mx.random.seed(42)
-random.seed(42)
-
-# set the context on GPU is available otherwise CPU
-ctx = mx.gpu() if mx.test_utils.list_gpus() else mx.cpu()
-```
-
-## Prepare Data
-
-We use the [MovieLens](http://grouplens.org/datasets/movielens/) data here, but
-it can apply to other datasets as well. Each row of this dataset contains a
-tuple of user id, movie id, rating, and time stamp, we will only use the first
-three items. We first define the a batch which contains n tuples. It also
-provides name and shape information to MXNet about the data and label.
-
-
-```python
-
-class Batch(object):
-    def __init__(self, data_names, data, label_names, label):
-        self.data = data
-        self.label = label
-        self.data_names = data_names
-        self.label_names = label_names
-
-    @property
-    def provide_data(self):
-        return [(n, x.shape) for n, x in zip(self.data_names, self.data)]
-
-    @property
-    def provide_label(self):
-        return [(n, x.shape) for n, x in zip(self.label_names, self.label)]
-```
-
-Then we define a data iterator, which returns a batch of tuples each time.
-
-
-```python
-
-class Batch(object):
-    def __init__(self, data_names, data, label_names, label):
-        self.data = data
-        self.label = label
-        self.data_names = data_names
-        self.label_names = label_names
-
-    @property
-    def provide_data(self):
-        return [(n, x.shape) for n, x in zip(self.data_names, self.data)]
-
-    @property
-    def provide_label(self):
-        return [(n, x.shape) for n, x in zip(self.label_names, self.label)]
-
-class DataIter(mx.io.DataIter):
-    def __init__(self, fname, batch_size):
-        super(DataIter, self).__init__()
-        self.batch_size = batch_size
-        self.data = []
-        for line in open(fname):
-            tks = line.strip().split('\t')
-            if len(tks) != 4:
-                continue
-            self.data.append((int(tks[0]), int(tks[1]), float(tks[2])))
-        self.provide_data = [('user', (batch_size, )), ('item', (batch_size, ))]
-        self.provide_label = [('score', (self.batch_size, ))]
-
-    def __iter__(self):
-        for k in range(int(len(self.data) / self.batch_size)):
-            users = []
-            items = []
-            scores = []
-            for i in range(self.batch_size):
-                j = k * self.batch_size + i
-                user, item, score = self.data[j]
-                users.append(user)
-                items.append(item)
-                scores.append(score)
-
-            data_all = [mx.nd.array(users), mx.nd.array(items)]
-            label_all = [mx.nd.array(scores)]
-            data_names = ['user', 'item']
-            label_names = ['score']
-
-            data_batch = Batch(data_names, data_all, label_names, label_all)
-            yield data_batch
-
-    def reset(self):
-        random.shuffle(self.data)
-```
-
-Now we download the data and provide a function to obtain the data iterator:
-
-
-```python
-import os
-import urllib
-import zipfile
-file = mx.test_utils.download('http://files.grouplens.org/datasets/movielens/ml-100k.zip', 'ml-100k.zip')
-with zipfile.ZipFile("ml-100k.zip","r") as f:
-    f.extractall(".")
-def get_data(batch_size):
-    return (DataIter(os.path.join('.','ml-100k','u1.base'), batch_size), DataIter(os.path.join('.','ml-100k','u1.test'), batch_size))
-```
-
-Finally we calculate the numbers of users and items for later use.
-
-```python
-def max_id(fname):
-    mu = 0
-    mi = 0
-    for line in open(fname):
-        tks = line.strip().split('\t')
-        if len(tks) != 4:
-            continue
-        mu = max(mu, int(tks[0]))
-        mi = max(mi, int(tks[1]))
-    return mu + 1, mi + 1
-max_user, max_item = max_id(os.path.join('.','ml-100k','u.data'))
-(max_user, max_item)
-```
-
-## Optimization
-
-We first implement the RMSE (root-mean-square error) measurement, which is
-commonly used by matrix factorization.
-
-```python
-import math
-def RMSE(label, pred):
-    ret = 0.0
-    n = 0.0
-    pred = pred.flatten()
-    for i in range(len(label)):
-        ret += (label[i] - pred[i]) * (label[i] - pred[i])
-        n += 1.0
-    return math.sqrt(ret / n)
-```
-
-Then we define a general training module, which is borrowed from the image
-classification application.
-
-```python
-def train(network, batch_size, num_epoch, learning_rate):
-    model = mx.mod.Module(symbol=network, context=ctx, data_names=('user','item'), label_names=['score'])
-
-    batch_size = 64
-    train, test = get_data(batch_size)
-
-    model.fit(train,
-              eval_data = test,
-              eval_metric = RMSE,
-              batch_end_callback=mx.callback.Speedometer(batch_size, 20000/batch_size),
-              num_epoch=num_epoch,
-              optimizer='sgd',
-              optimizer_params={'learning_rate':learning_rate, 'momentum':0.9, 'wd':0.0001}
-              )
-```
-
-## Networks
-
-Now we try various networks. We first learn the latent vectors directly.
-
-```python
-def plain_net(k):
-    # input
-    user = mx.symbol.Variable('user')
-    item = mx.symbol.Variable('item')
-    score = mx.symbol.Variable('score')
-    # user feature lookup
-    user = mx.symbol.Embedding(data = user, input_dim = max_user, output_dim = k)
-    # item feature lookup
-    item = mx.symbol.Embedding(data = item, input_dim = max_item, output_dim = k)
-    # predict by the inner product, which is elementwise product and then sum
-    pred = user * item
-    pred = mx.symbol.sum_axis(data = pred, axis = 1)
-    pred = mx.symbol.Flatten(data = pred)
-    # loss layer
-    pred = mx.symbol.LinearRegressionOutput(data = pred, label = score)
-    return pred
-
-train(plain_net(64), batch_size=64, num_epoch=10, learning_rate=.05)
-```
-
-Next we try to use 2 layers neural network to learn the latent variables, which stack a fully connected layer above the embedding layers:
-
-```python
-def get_one_layer_mlp(hidden, k):
-    # input
-    user = mx.symbol.Variable('user')
-    item = mx.symbol.Variable('item')
-    score = mx.symbol.Variable('score')
-    # user latent features
-    user = mx.symbol.Embedding(data = user, input_dim = max_user, output_dim = k)
-    user = mx.symbol.Activation(data = user, act_type="relu")
-    user = mx.symbol.FullyConnected(data = user, num_hidden = hidden)
-    # item latent features
-    item = mx.symbol.Embedding(data = item, input_dim = max_item, output_dim = k)
-    item = mx.symbol.Activation(data = item, act_type="relu")
-    item = mx.symbol.FullyConnected(data = item, num_hidden = hidden)
-    # predict by the inner product
-    pred = user * item
-    pred = mx.symbol.sum_axis(data = pred, axis = 1)
-    pred = mx.symbol.Flatten(data = pred)
-    # loss layer
-    pred = mx.symbol.LinearRegressionOutput(data = pred, label = score)
-    return pred
-
-train(get_one_layer_mlp(64, 64), batch_size=64, num_epoch=10, learning_rate=.05)
-```
-
-Adding dropout layers to relief the over-fitting.
-
-```python
-def get_one_layer_dropout_mlp(hidden, k):
-    # input
-    user = mx.symbol.Variable('user')
-    item = mx.symbol.Variable('item')
-    score = mx.symbol.Variable('score')
-    # user latent features
-    user = mx.symbol.Embedding(data = user, input_dim = max_user, output_dim = k)
-    user = mx.symbol.Activation(data = user, act_type="relu")
-    user = mx.symbol.FullyConnected(data = user, num_hidden = hidden)
-    user = mx.symbol.Dropout(data=user, p=0.5)
-    # item latent features
-    item = mx.symbol.Embedding(data = item, input_dim = max_item, output_dim = k)
-    item = mx.symbol.Activation(data = item, act_type="relu")
-    item = mx.symbol.FullyConnected(data = item, num_hidden = hidden)
-    item = mx.symbol.Dropout(data=item, p=0.5)
-    # predict by the inner product
-    pred = user * item
-    pred = mx.symbol.sum_axis(data = pred, axis = 1)
-    pred = mx.symbol.Flatten(data = pred)
-    # loss layer
-    pred = mx.symbol.LinearRegressionOutput(data = pred, label = score)
-    return pred
-train(get_one_layer_mlp(256, 512), batch_size=64, num_epoch=10, learning_rate=.05)
-```
-
-
diff --git a/tests/tutorials/test_tutorials.py b/tests/tutorials/test_tutorials.py
index 37ba9918fb70..8fa031300d11 100644
--- a/tests/tutorials/test_tutorials.py
+++ b/tests/tutorials/test_tutorials.py
@@ -139,9 +139,6 @@ def test_onnx_fine_tuning_gluon():
 def test_onnx_inference_on_onnx_model():
     assert _test_tutorial_nb('onnx/inference_on_onnx_model')
 
-def test_python_matrix_factorization():
-    assert _test_tutorial_nb('python/matrix_factorization')
-
 def test_python_linear_regression():
     assert _test_tutorial_nb('python/linear-regression')
 
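The deleted tutorial's core scoring rule is worth keeping in view: a rating is predicted as the dot product of a user's latent vector and an item's latent vector. A minimal imperative NDArray sketch of what the deleted `plain_net` builds symbolically — the table sizes and ids here are illustrative stand-ins for the values `max_id` computed from `u.data`:

```python
import mxnet as mx

# Illustrative sizes; the deleted tutorial derived these from the data.
num_users, num_items, k = 944, 1683, 64

# One lookup table per entity type, mirroring the two Embedding layers.
user_emb = mx.nd.random.normal(scale=0.01, shape=(num_users, k))
item_emb = mx.nd.random.normal(scale=0.01, shape=(num_items, k))

def predict(user_ids, item_ids):
    """Predicted rating = dot product of user and item latent vectors."""
    u = mx.nd.take(user_emb, user_ids)   # (batch, k)
    v = mx.nd.take(item_emb, item_ids)   # (batch, k)
    return (u * v).sum(axis=1)           # elementwise product, sum over k

scores = predict(mx.nd.array([1, 2, 3]), mx.nd.array([10, 20, 30]))
print(scores.shape)  # (3,)
```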