From 8f4b092e57fb0b97a367f092c468f66193717d93 Mon Sep 17 00:00:00 2001
From: Junru Shao
Date: Tue, 24 Jul 2018 00:39:53 -0700
Subject: [PATCH 01/63] Enable control flow test (#11869)

---
 tests/python/unittest/test_contrib_control_flow.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/tests/python/unittest/test_contrib_control_flow.py b/tests/python/unittest/test_contrib_control_flow.py
index 1c4e491c1..67ed78ee0 100644
--- a/tests/python/unittest/test_contrib_control_flow.py
+++ b/tests/python/unittest/test_contrib_control_flow.py
@@ -1664,6 +1664,5 @@ def test_foreach_rnn():
 
 if __name__ == '__main__':
-    # import nose
-    # nose.runmodule()
-    test_cond()
+    import nose
+    nose.runmodule()

From 64d2e8bfe737fc6268bd4bbb65964b33838f4bd2 Mon Sep 17 00:00:00 2001
From: vishaalkapoor <40836875+vishaalkapoor@users.noreply.github.com>
Date: Tue, 24 Jul 2018 13:23:01 -0700
Subject: [PATCH 02/63] [MXAPPS-581] Nightly Straight Dope tests. (#11814)

* [MXAPPS-581] Nightly Straight Dope tests.

The Straight Dope notebooks will be retrieved from the GitHub repo, run,
and scanned for warnings and errors. Because we are not checking the
accuracy of the training, we set the number of epochs to 1 to reduce the
integration test run time.

* Common functionality for running and testing notebooks has been
  factored into a common test util module.
* Support for running UTF-8 notebooks added (Python 2 and 3 compatible).
* Notebooks requiring a single GPU and notebooks requiring multiple GPUs
  have been split into two different test suites so that they can be run
  on different hardware.
* Add a test to make sure that all notebooks are tested.
* Comment out broken notebooks while they are being fixed (I will
  uncomment them in a follow-up PR).

* [MXAPPS-581] Download notebooks in test setup.

* Moving logic to download the Straight Dope notebooks to the test
  harness.
* Remove cache logic as it is unnecessary.

* [MXAPPS-581] Add a timeout for download of notebooks.

* [MXAPPS-581] Move notebooks requiring multi-GPUs.

Move two notebooks requiring multiple GPUs out of the single-GPU test
suite.
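For orientation, the harness this patch introduces boils down to executing each notebook with nbconvert's `ExecutePreprocessor` and treating any `Warning:` text in the executed output as a failure. The sketch below is illustrative only: `run_and_scan` is a placeholder name, and the real implementation is `run_notebook` in `tests/utils/notebook_test/__init__.py` further down in this patch.

```python
import nbformat
from nbconvert.preprocessors import ExecutePreprocessor

def run_and_scan(notebook_path, kernel=None, timeout=10 * 60):
    """Execute one notebook; return True iff it runs without warnings."""
    nb = nbformat.read(notebook_path, as_version=4)
    if kernel is not None:
        ep = ExecutePreprocessor(timeout=timeout, kernel_name=kernel)
    else:
        ep = ExecutePreprocessor(timeout=timeout)
    # Raises CellExecutionError if any cell fails to execute.
    ep.preprocess(nb, {'metadata': {'path': '.'}})
    # Scan the text (stream) outputs of the executed cells; the nightly
    # tests treat a 'Warning:' in the output as a failure, not just errors.
    for cell in nb.cells:
        for output in cell.get('outputs', []):
            if 'Warning:' in output.get('text', ''):
                return False
    return True
```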
--- ci/docker/runtime_functions.sh | 38 ++ tests/nightly/JenkinsfileForBinaries | 36 ++ tests/nightly/straight_dope/README.md | 7 + .../straight_dope/straight_dope_test_utils.py | 130 +++++++ .../straight_dope/test_notebooks_multi_gpu.py | 49 +++ .../test_notebooks_single_gpu.py | 332 ++++++++++++++++++ tests/tutorials/test_tutorials.py | 80 +---- tests/utils/notebook_test/__init__.py | 97 +++++ 8 files changed, 705 insertions(+), 64 deletions(-) create mode 100755 tests/nightly/straight_dope/README.md create mode 100644 tests/nightly/straight_dope/straight_dope_test_utils.py create mode 100644 tests/nightly/straight_dope/test_notebooks_multi_gpu.py create mode 100644 tests/nightly/straight_dope/test_notebooks_single_gpu.py create mode 100644 tests/utils/notebook_test/__init__.py diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh index c899fe5c1..a0795eb58 100755 --- a/ci/docker/runtime_functions.sh +++ b/ci/docker/runtime_functions.sh @@ -895,6 +895,44 @@ nightly_test_javascript() { make -C /work/mxnet/amalgamation libmxnet_predict.js MIN=1 EMCC=/work/deps/emscripten/emcc } +# Nightly 'MXNet: The Straight Dope' Single-GPU Tests +nightly_straight_dope_python2_single_gpu_tests() { + set -ex + cd /work/mxnet/tests/nightly/straight_dope + export PYTHONPATH=/work/mxnet/python/ + export MXNET_TEST_KERNEL=python2 + nosetests-2.7 --with-xunit --xunit-file nosetests_straight_dope_python2_single_gpu.xml \ + test_notebooks_single_gpu.py --nologcapture +} + +nightly_straight_dope_python3_single_gpu_tests() { + set -ex + cd /work/mxnet/tests/nightly/straight_dope + export PYTHONPATH=/work/mxnet/python/ + export MXNET_TEST_KERNEL=python3 + nosetests-3.4 --with-xunit --xunit-file nosetests_straight_dope_python3_single_gpu.xml \ + test_notebooks_single_gpu.py --nologcapture +} + +# Nightly 'MXNet: The Straight Dope' Multi-GPU Tests +nightly_straight_dope_python2_multi_gpu_tests() { + set -ex + cd /work/mxnet/tests/nightly/straight_dope + export PYTHONPATH=/work/mxnet/python/ + export MXNET_TEST_KERNEL=python2 + nosetests-2.7 --with-xunit --xunit-file nosetests_straight_dope_python2_multi_gpu.xml \ + test_notebooks_multi_gpu.py --nologcapture +} + +nightly_straight_dope_python3_multi_gpu_tests() { + set -ex + cd /work/mxnet/tests/nightly/straight_dope + export PYTHONPATH=/work/mxnet/python/ + export MXNET_TEST_KERNEL=python3 + nosetests-3.4 --with-xunit --xunit-file nosetests_straight_dope_python3_multi_gpu.xml \ + test_notebooks_multi_gpu.py --nologcapture +} + # Deploy deploy_docs() { diff --git a/tests/nightly/JenkinsfileForBinaries b/tests/nightly/JenkinsfileForBinaries index 3d958b1de..0b009d28a 100755 --- a/tests/nightly/JenkinsfileForBinaries +++ b/tests/nightly/JenkinsfileForBinaries @@ -98,6 +98,42 @@ try { docker_run('ubuntu_nightly_gpu', 'nightly_test_KVStore_singleNode', true) } } + }, + 'StraightDope: Python2 Single-GPU': { + node('mxnetlinux-gpu-p3') { + ws('workspace/straight_dope-single_gpu') { + init_git() + unpack_lib('gpu', mx_lib) + docker_run('ubuntu_nightly_gpu', 'nightly_straight_dope_python2_single_gpu_tests', true) + } + } + }, + 'StraightDope: Python2 Multi-GPU': { + node('mxnetlinux-gpu') { + ws('workspace/straight_dope-multi_gpu') { + init_git() + unpack_lib('gpu', mx_lib) + docker_run('ubuntu_nightly_gpu', 'nightly_straight_dope_python2_multi_gpu_tests', true) + } + } + }, + 'StraightDope: Python3 Single-GPU': { + node('mxnetlinux-gpu-p3') { + ws('workspace/straight_dope-single_gpu') { + init_git() + unpack_lib('gpu', mx_lib) + 
docker_run('ubuntu_nightly_gpu', 'nightly_straight_dope_python3_single_gpu_tests', true)
+      }
+    }
+  },
+  'StraightDope: Python3 Multi-GPU': {
+    node('mxnetlinux-gpu') {
+      ws('workspace/straight_dope-multi_gpu') {
+        init_git()
+        unpack_lib('gpu', mx_lib)
+        docker_run('ubuntu_nightly_gpu', 'nightly_straight_dope_python3_multi_gpu_tests', true)
+      }
+    }
   }
 }
} catch (caughtError) {
diff --git a/tests/nightly/straight_dope/README.md b/tests/nightly/straight_dope/README.md
new file mode 100755
index 000000000..65a615b58
--- /dev/null
+++ b/tests/nightly/straight_dope/README.md
@@ -0,0 +1,7 @@
+# Nightly Tests for MXNet: The Straight Dope
+
+These are some longer-running tests that are scheduled to run every night.
+
+### Description
+These tests verify that the Straight Dope tutorials run without error. They are
+run on both single-GPU and multi-GPU configurations.
diff --git a/tests/nightly/straight_dope/straight_dope_test_utils.py b/tests/nightly/straight_dope/straight_dope_test_utils.py
new file mode 100644
index 000000000..bb64f37fe
--- /dev/null
+++ b/tests/nightly/straight_dope/straight_dope_test_utils.py
@@ -0,0 +1,130 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#pylint: disable=no-member, too-many-locals, too-many-branches, no-self-use, broad-except, lost-exception, too-many-nested-blocks, too-few-public-methods, invalid-name
+"""
+    This file provides utilities to ensure that the Straight Dope notebooks
+    run without warning or exception.
+
+    The environment variable MXNET_TEST_KERNEL controls which kernel to use
+    when running a notebook, e.g.: `export MXNET_TEST_KERNEL=python2`
+"""
+import io
+import os
+import re
+import shutil
+import subprocess
+import sys
+from time import sleep
+
+# TODO(vishaalk): Find a cleaner way to import this module.
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', '..', 'utils'))
+from notebook_test import run_notebook
+
+EPOCHS_REGEX = r'epochs\s+=\s+[0-9]+'  # Regular expression that matches 'epochs = #'
+GIT_PATH = '/usr/bin/git'
+GIT_REPO = 'https://github.com/zackchase/mxnet-the-straight-dope'
+KERNEL = os.getenv('MXNET_TEST_KERNEL', None)
+NOTEBOOKS_DIR = os.path.join(os.path.dirname(__file__), 'tmp_notebook')
+
+def _test_notebook(notebook, override_epochs=True):
+    """Run a Jupyter notebook to catch any execution error.
+
+    Args:
+        notebook : string
+            notebook name in folder/notebook format
+        override_epochs : boolean
+            whether or not to override the number of epochs to 1
+    Returns:
+        True if the notebook runs without warning or error.
+    """
+    if override_epochs:
+        _override_epochs(notebook)
+    return run_notebook(notebook, NOTEBOOKS_DIR, kernel=KERNEL, temp_dir=NOTEBOOKS_DIR)
+
+
+def _override_epochs(notebook):
+    """Overrides the number of epochs in the notebook to 1 epoch. Note this operation is idempotent.
+ + Args: + notebook : string + notebook name in folder/notebook format + + """ + notebook_path = os.path.join(*([NOTEBOOKS_DIR] + notebook.split('/'))) + ".ipynb" + + # Read the notebook and set epochs to num_epochs + with io.open(notebook_path, 'r', encoding='utf-8') as f: + notebook = f.read() + + # Set number of epochs to 1 + modified_notebook = re.sub(EPOCHS_REGEX, 'epochs = 1', notebook) + + # Replace the original notebook with the modified one. + with io.open(notebook_path, 'w', encoding='utf-8') as f: + f.write(modified_notebook) + + +def _download_straight_dope_notebooks(): + """Downloads the Straight Dope Notebooks. + + Returns: + True if it succeeds in downloading the notebooks without error. + """ + print('Cleaning and setting up notebooks directory "{}"'.format(NOTEBOOKS_DIR)) + shutil.rmtree(NOTEBOOKS_DIR, ignore_errors=True) + + cmd = [GIT_PATH, + 'clone', + GIT_REPO, + NOTEBOOKS_DIR] + + proc, msg = _run_command(cmd) + + if proc.returncode != 0: + err_msg = 'Error downloading Straight Dope notebooks.\n' + err_msg += msg + print(err_msg) + return False + return True + +def _run_command(cmd, timeout_secs=300): + """ Runs a command with a specified timeout. + + Args: + cmd : list of string + The command with arguments to run. + timeout_secs: integer + The timeout in seconds + + Returns: + Returns the process and the output as a pair. + """ + proc = subprocess.Popen( + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT) + + for i in range(timeout_secs): + sleep(1) + if proc.poll() is not None: + (out, _) = proc.communicate() + return proc, out.decode('utf-8') + + proc.kill() + return proc, "Timeout of %s secs exceeded." % timeout_secs + diff --git a/tests/nightly/straight_dope/test_notebooks_multi_gpu.py b/tests/nightly/straight_dope/test_notebooks_multi_gpu.py new file mode 100644 index 000000000..2038ada3a --- /dev/null +++ b/tests/nightly/straight_dope/test_notebooks_multi_gpu.py @@ -0,0 +1,49 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +#pylint: disable=no-member, too-many-locals, too-many-branches, no-self-use, broad-except, lost-exception, too-many-nested-blocks, too-few-public-methods, invalid-name, missing-docstring +""" + This file tests that the notebooks requiring multi GPUs run without + warning or exception. 
+""" +import unittest +from straight_dope_test_utils import _test_notebook +from straight_dope_test_utils import _download_straight_dope_notebooks + +class StraightDopeMultiGpuTests(unittest.TestCase): + @classmethod + def setUpClass(self): + assert _download_straight_dope_notebooks() + + # Chapter 7 + + # TODO(vishaalk): module 'mxnet.gluon' has no attribute 'autograd' + #def test_multiple_gpus_scratch(self): + # assert _test_notebook('chapter07_distributed-learning/multiple-gpus-scratch') + + def test_multiple_gpus_gluon(self): + assert _test_notebook('chapter07_distributed-learning/multiple-gpus-gluon') + + # TODO(vishaalk): Do a dry run, and then enable. + #def test_training_with_multiple_machines(self): + # assert _test_notebook('chapter07_distributed-learning/training-with-multiple-machines') + + # Chapter 8 + + # TODO(vishaalk): Module skimage needs to be added to docker image. + # def test_fine_tuning(self): + # assert _test_notebook('chapter08_computer-vision/fine-tuning') diff --git a/tests/nightly/straight_dope/test_notebooks_single_gpu.py b/tests/nightly/straight_dope/test_notebooks_single_gpu.py new file mode 100644 index 000000000..b87d16cb0 --- /dev/null +++ b/tests/nightly/straight_dope/test_notebooks_single_gpu.py @@ -0,0 +1,332 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +#pylint: disable=no-member, too-many-locals, too-many-branches, no-self-use, broad-except, lost-exception, too-many-nested-blocks, too-few-public-methods, invalid-name, missing-docstring +""" + This file tests that the notebooks requiring a single GPU run without + warning or exception. +""" +import glob +import re +import os +import unittest +from straight_dope_test_utils import _test_notebook +from straight_dope_test_utils import _download_straight_dope_notebooks + +NOTEBOOKS_WHITELIST = [ + 'chapter01_crashcourse/preface', + 'chapter01_crashcourse/introduction', + 'chapter01_crashcourse/chapter-one-problem-set', + 'chapter02_supervised-learning/environment', + 'chapter07_distributed-learning/multiple-gpus-scratch', + 'chapter07_distributed-learning/multiple-gpus-gluon', + 'chapter07_distributed-learning/training-with-multiple-machines' +] + + +class StraightDopeSingleGpuTests(unittest.TestCase): + @classmethod + def setUpClass(self): + assert _download_straight_dope_notebooks() + + + def test_completeness(self): + """ + Make sure that every tutorial that isn't in the whitelist is considered for testing by this + file. Exceptions should be added to the whitelist. + N.B. If the test is commented out, then that will be viewed as an intentional disabling of the + test. + """ + # Open up this test file. 
+        with open(__file__, 'r') as f:
+            notebook_test_text = '\n'.join(f.readlines())
+
+        notebooks_path = os.path.join(os.path.dirname(__file__), 'straight_dope_book')
+        notebooks = glob.glob(os.path.join(notebooks_path, '**', '*.ipynb'))
+
+        # Compile a list of notebooks that are tested
+        tested_notebooks = set(re.findall(r"assert _test_notebook\('(.*)'\)", notebook_test_text))
+
+        # Ensure each notebook in the straight dope book directory is on the whitelist or is tested.
+        for notebook in notebooks:
+            friendly_name = '/'.join(notebook.split('/')[-2:]).split('.')[0]
+            if friendly_name not in tested_notebooks and friendly_name not in NOTEBOOKS_WHITELIST:
+                assert False, friendly_name + " has not been added to the nightly/tests/straight_" + \
+                              "dope/test_notebooks_single_gpu.py test suite. Consider also adding " + \
+                              "it to nightly/tests/straight_dope/test_notebooks_multi_gpu.py as " + \
+                              "well if the notebook makes use of multiple GPUs."
+
+    def test_ndarray(self):
+        assert _test_notebook('chapter01_crashcourse/ndarray')
+
+    def test_linear_algebra(self):
+        assert _test_notebook('chapter01_crashcourse/linear-algebra')
+
+    def test_probability(self):
+        assert _test_notebook('chapter01_crashcourse/probability')
+
+    # TODO(vishaalk): Notebook contains the word 'Warning'. Needs to be updated to a synonym.
+    #def test_autograd(self):
+    #    assert _test_notebook('chapter01_crashcourse/autograd')
+
+    # Chapter 2
+
+    def test_linear_regression_scratch(self):
+        assert _test_notebook('chapter02_supervised-learning/linear-regression-scratch')
+
+    def test_linear_regression_gluon(self):
+        assert _test_notebook('chapter02_supervised-learning/linear-regression-gluon')
+
+    # TODO(vishaalk): There is a relative file path that needs to be fixed so that
+    # the Python code can be run from another directory.
+    #def test_logistic_regression_gluon(self):
+    #    assert _test_notebook('chapter02_supervised-learning/logistic-regression-gluon')
+
+    def test_softmax_regression_scratch(self):
+        assert _test_notebook('chapter02_supervised-learning/softmax-regression-scratch')
+
+    def test_softmax_regression_gluon(self):
+        assert _test_notebook('chapter02_supervised-learning/softmax-regression-gluon')
+
+    def test_regularization_scratch(self):
+        assert _test_notebook('chapter02_supervised-learning/regularization-scratch')
+
+    # TODO(vishaalk): Notebook does not appear to be JSON: '{\n "cells": [\n {\n "cell_type": "m....
+    #def test_regularization_gluon(self):
+    #    assert _test_notebook('chapter02_supervised-learning/regularization-gluon')
+
+    def test_perceptron(self):
+        assert _test_notebook('chapter02_supervised-learning/perceptron')
+
+    # Chapter 3
+
+    def test_mlp_scratch(self):
+        assert _test_notebook('chapter03_deep-neural-networks/mlp-scratch')
+
+    def test_mlp_gluon(self):
+        assert _test_notebook('chapter03_deep-neural-networks/mlp-gluon')
+
+    def test_mlp_dropout_scratch(self):
+        assert _test_notebook('chapter03_deep-neural-networks/mlp-dropout-scratch')
+
+    def test_mlp_dropout_gluon(self):
+        assert _test_notebook('chapter03_deep-neural-networks/mlp-dropout-gluon')
+
+    def test_plumbing(self):
+        assert _test_notebook('chapter03_deep-neural-networks/plumbing')
+
+    def test_custom_layer(self):
+        assert _test_notebook('chapter03_deep-neural-networks/custom-layer')
+
+    #def test_kaggle_gluon_kfold(self):
+    #    assert _test_notebook('chapter03_deep-neural-networks/kaggle-gluon-kfold')
+
+    # TODO(vishaalk): load_params and save_params emit a deprecation warning.
+    #def test_serialization(self):
+    #    assert _test_notebook('chapter03_deep-neural-networks/serialization')
+
+    # Chapter 4
+
+    def test_cnn_scratch(self):
+        assert _test_notebook('chapter04_convolutional-neural-networks/cnn-scratch')
+
+    def test_cnn_gluon(self):
+        assert _test_notebook('chapter04_convolutional-neural-networks/cnn-gluon')
+
+    # TODO(vishaalk): load_params and save_params emit a deprecation warning.
+    #def test_deep_cnns_alexnet(self):
+    #    assert _test_notebook('chapter04_convolutional-neural-networks/deep-cnns-alexnet')
+
+    def test_very_deep_nets_vgg(self):
+        assert _test_notebook('chapter04_convolutional-neural-networks/very-deep-nets-vgg')
+
+    def test_cnn_batch_norm_scratch(self):
+        assert _test_notebook('chapter04_convolutional-neural-networks/cnn-batch-norm-scratch')
+
+    def test_cnn_batch_norm_gluon(self):
+        assert _test_notebook('chapter04_convolutional-neural-networks/cnn-batch-norm-gluon')
+
+    # Chapter 5
+
+    # TODO(vishaalk): There is a relative file path that needs to be fixed so that
+    # the Python code can be run from another directory.
+    #def test_simple_rnn(self):
+    #    assert _test_notebook('chapter05_recurrent-neural-networks/simple-rnn')
+
+    # TODO(vishaalk): There is a relative file path that needs to be fixed so that
+    # the Python code can be run from another directory.
+    #def test_lstm_scratch(self):
+    #    assert _test_notebook('chapter05_recurrent-neural-networks/lstm-scratch')
+
+    # TODO(vishaalk): There is a relative file path that needs to be fixed so that
+    # the Python code can be run from another directory.
+    #def test_gru_scratch(self):
+    #    assert _test_notebook('chapter05_recurrent-neural-networks/gru-scratch')
+
+    #def test_rnns_gluon(self):
+    #    assert _test_notebook('chapter05_recurrent-neural-networks/rnns-gluon')
+
+    # Chapter 6
+
+    def test_optimization_intro(self):
+        assert _test_notebook('chapter06_optimization/optimization-intro')
+
+    # TODO(vishaalk): RuntimeWarning: Overflow encountered in reduce.
+    #def test_gd_sgd_scratch(self):
+    #    assert _test_notebook('chapter06_optimization/gd-sgd-scratch')
+
+    # TODO(vishaalk): RuntimeWarning: Overflow encountered in reduce.
+    #def test_gd_sgd_gluon(self):
+    #    assert _test_notebook('chapter06_optimization/gd-sgd-gluon')
+
+    def test_momentum_scratch(self):
+        assert _test_notebook('chapter06_optimization/momentum-scratch')
+
+    def test_momentum_gluon(self):
+        assert _test_notebook('chapter06_optimization/momentum-gluon')
+
+    def test_adagrad_scratch(self):
+        assert _test_notebook('chapter06_optimization/adagrad-scratch')
+
+    def test_adagrad_gluon(self):
+        assert _test_notebook('chapter06_optimization/adagrad-gluon')
+
+    def test_rmsprop_scratch(self):
+        assert _test_notebook('chapter06_optimization/rmsprop-scratch')
+
+    def test_rmsprop_gluon(self):
+        assert _test_notebook('chapter06_optimization/rmsprop-gluon')
+
+    def test_adadelta_scratch(self):
+        assert _test_notebook('chapter06_optimization/adadelta-scratch')
+
+    def test_adadelta_gluon(self):
+        assert _test_notebook('chapter06_optimization/adadelta-gluon')
+
+    def test_adam_scratch(self):
+        assert _test_notebook('chapter06_optimization/adam-scratch')
+
+    def test_adam_gluon(self):
+        assert _test_notebook('chapter06_optimization/adam-gluon')
+
+    # Chapter 7
+
+    def test_hybridize(self):
+        assert _test_notebook('chapter07_distributed-learning/hybridize')
+
+
+    # Chapter 8
+
+    # TODO(vishaalk): load_params and save_params emit a deprecation warning.
+    #def test_object_detection(self):
+    #    assert _test_notebook('chapter08_computer-vision/object-detection')
+
+    # TODO(vishaalk): Module skimage needs to be added to the Docker image.
+    #def test_fine_tuning(self):
+    #    assert _test_notebook('chapter08_computer-vision/fine-tuning')
+
+    # TODO(vishaalk):
+    #def test_visual_question_answer(self):
+    #    assert _test_notebook('chapter08_computer-vision/visual-question-answer')
+
+    # Chapter 9
+
+    def test_tree_lstm(self):
+        assert _test_notebook('chapter09_natural-language-processing/tree-lstm')
+
+    # Chapter 11
+
+    # TODO(vishaalk): Deferred initialization failed because shape cannot be inferred.
+    #def test_intro_recommender_systems(self):
+    #    assert _test_notebook('chapter11_recommender-systems/intro-recommender-systems')
+
+    # Chapter 12
+
+    def test_lds_scratch(self):
+        assert _test_notebook('chapter12_time-series/lds-scratch')
+
+    # TODO(vishaalk): File doesn't appear to be valid JSON.
+    #def test_issm_scratch(self):
+    #    assert _test_notebook('chapter12_time-series/issm-scratch')
+
+    # TODO(vishaalk): Error: 'sequential1_batchnorm0_running_mean' has not been initialized
+    # def test_intro_forecasting_gluon(self):
+    #    assert _test_notebook('chapter12_time-series/intro-forecasting-gluon')
+
+    #def test_intro_forecasting_2_gluon(self):
+    #    assert _test_notebook('chapter12_time-series/intro-forecasting-2-gluon')
+
+    # Chapter 13
+
+    # TODO(vishaalk): load_params and save_params emit a deprecation warning.
+    #def test_vae_gluon(self):
+    #    assert _test_notebook('chapter13_unsupervised-learning/vae-gluon')
+
+    # Chapter 14
+
+    def test_gan_intro(self):
+        assert _test_notebook('chapter14_generative-adversarial-networks/gan-intro')
+
+    def test_dcgan(self):
+        assert _test_notebook('chapter14_generative-adversarial-networks/dcgan')
+
+    def test_generative_adversarial_networks(self):
+        assert _test_notebook('chapter14_generative-adversarial-networks/conditional')
+
+    # Chapter 16
+
+    # TODO(vishaalk): Check failed: oshape.Size() != dshape.Size()
+    #def test_tensor_basics(self):
+    #    assert _test_notebook('chapter16_tensor_methods/tensor_basics')
+
+    # TODO(vishaalk): Notebook does not appear to be valid JSON.
+    #def test_pixel2pixel(self):
+    #    assert _test_notebook('chapter14_generative-adversarial-networks/pixel2pixel')
+
+    # Chapter 17
+
+    # TODO(vishaalk): Requires OpenAI Gym. Also uses deprecated load_params.
+    #def test_dqn(self):
+#    assert _test_notebook('chapter17_deep-reinforcement-learning/DQN')
+
+#def test_ddqn(self):
+#    assert _test_notebook('chapter17_deep-reinforcement-learning/DDQN')
+
+# Chapter 18
+
+#def test_bayes_by_backprop(self):
+#    assert _test_notebook('chapter18_variational-methods-and-uncertainty/bayes-by-backprop')
+
+#def test_bayes_by_backprop_gluon(self):
+#    assert _test_notebook('chapter18_variational-methods-and-uncertainty/bayes-by-backprop-gluon')
+
+# TODO(vishaalk): AttributeError: 'list' object has no attribute 'keys'
+#def test_bayes_by_backprop_rnn(self):
+#    assert _test_notebook('chapter18_variational-methods-and-uncertainty/bayes-by-backprop-rnn')
+
+# Chapter 19
+
+# TODO(vishaalk): Requires deepchem
+#def test_graph_neural_networks(self):
+#    assert _test_notebook('chapter19_graph-neural-networks/Graph-Neural-Networks')
+
+# Cheatsheets
+
+# TODO(vishaalk): There is a relative file path that needs to be fixed so that
+# the Python code can be run from another directory.
+#def test_kaggle_gluon_kfold(self): +# assert _test_notebook('cheatsheets/kaggle-gluon-kfold') diff --git a/tests/tutorials/test_tutorials.py b/tests/tutorials/test_tutorials.py index a95e3250c..d2d5e6e15 100644 --- a/tests/tutorials/test_tutorials.py +++ b/tests/tutorials/test_tutorials.py @@ -33,77 +33,29 @@ a clean workspace. """ import os -import warnings -import imp -import shutil -import time -import argparse -import traceback -import nbformat -from nbconvert.preprocessors import ExecutePreprocessor import sys +sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'utils')) +from notebook_test import run_notebook -# Maximum 10 minutes per test -# Reaching timeout causes test failure -TIME_OUT = 10*60 -# Pin to ipython version 4 -IPYTHON_VERSION = 4 -temp_dir = 'tmp_notebook' + +TUTORIAL_DIR = os.path.join(os.path.dirname(__file__), '..', '..', 'docs', '_build', 'html', 'tutorials') +KERNEL = os.getenv('MXNET_TUTORIAL_TEST_KERNEL', None) +NO_CACHE = os.getenv('MXNET_TUTORIAL_TEST_NO_CACHE', False) def _test_tutorial_nb(tutorial): - """Run tutorial jupyter notebook to catch any execution error. + """Run tutorial Jupyter notebook to catch any execution error. Parameters ---------- tutorial : str - tutorial name in folder/tutorial format - """ - - tutorial_dir = os.path.join(os.path.dirname(__file__), '..', '..', 'docs', '_build', 'html', 'tutorials') - tutorial_path = os.path.join(*([tutorial_dir] + tutorial.split('/'))) - - # see env variable docs in the doc string of the file - kernel = os.getenv('MXNET_TUTORIAL_TEST_KERNEL', None) - no_cache = os.getenv('MXNET_TUTORIAL_TEST_NO_CACHE', False) - - working_dir = os.path.join(*([temp_dir] + tutorial.split('/'))) - - if no_cache == '1': - print("Cleaning and setting up temp directory '{}'".format(working_dir)) - shutil.rmtree(temp_dir, ignore_errors=True) - - errors = [] - notebook = None - if not os.path.isdir(working_dir): - os.makedirs(working_dir) - try: - notebook = nbformat.read(tutorial_path + '.ipynb', as_version=IPYTHON_VERSION) - # Adding a small delay to allow time for sockets to be freed - # stop-gap measure to battle the 1000ms linger of socket hard coded - # in the kernel API code - time.sleep(1.1) - if kernel is not None: - eprocessor = ExecutePreprocessor(timeout=TIME_OUT, kernel_name=kernel) - else: - eprocessor = ExecutePreprocessor(timeout=TIME_OUT) - nb, _ = eprocessor.preprocess(notebook, {'metadata': {'path': working_dir}}) - except Exception as err: - err_msg = str(err) - errors.append(err_msg) - finally: - if notebook is not None: - output_file = os.path.join(working_dir, "output.txt") - nbformat.write(notebook, output_file) - output_nb = open(output_file, mode='r') - for line in output_nb: - if "Warning:" in line: - errors.append("Warning:\n"+line) - if len(errors) > 0: - print('\n'.join(errors)) - return False - return True + the name of the tutorial to be tested + Returns + ------- + True if there are no warnings or errors. 
+ """ + return run_notebook(tutorial, TUTORIAL_DIR, kernel=KERNEL, no_cache=NO_CACHE) def test_basic_ndarray(): @@ -181,7 +133,7 @@ def test_onnx_inference_on_onnx_model(): def test_python_matrix_factorization(): assert _test_tutorial_nb('python/matrix_factorization') -def test_python_linear_regression() : +def test_python_linear_regression(): assert _test_tutorial_nb('python/linear-regression') def test_python_mnist(): @@ -204,7 +156,7 @@ def test_python_types_of_data_augmentation(): def test_python_profiler(): assert _test_tutorial_nb('python/profiler') - + def test_sparse_row_sparse(): assert _test_tutorial_nb('sparse/row_sparse') @@ -224,4 +176,4 @@ def test_vision_large_scale_classification(): assert _test_tutorial_nb('vision/large_scale_classification') def test_vision_cnn_visualization(): - assert _test_tutorial_nb('vision/cnn_visualization') \ No newline at end of file + assert _test_tutorial_nb('vision/cnn_visualization') diff --git a/tests/utils/notebook_test/__init__.py b/tests/utils/notebook_test/__init__.py new file mode 100644 index 000000000..cb5282fb4 --- /dev/null +++ b/tests/utils/notebook_test/__init__.py @@ -0,0 +1,97 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +#pylint: disable=no-member, too-many-locals, too-many-branches, no-self-use, broad-except, lost-exception, too-many-nested-blocks, too-few-public-methods, invalid-name +""" + This file tests provides functionality to test that notebooks run without + warning or exception. +""" +import io +import os +import shutil +import time + +from nbconvert.preprocessors import ExecutePreprocessor +import nbformat + + +IPYTHON_VERSION = 4 # Pin to ipython version 4. +TIME_OUT = 10*60 # Maximum 10 mins/test. Reaching timeout causes test failure. + +def run_notebook(notebook, notebook_dir, kernel=None, no_cache=False, temp_dir='tmp_notebook'): + """Run tutorial Jupyter notebook to catch any execution error. + + Parameters + ---------- + notebook : string + the name of the notebook to be tested + notebook_dir : string + the directory of the notebook to be tested + kernel : string, None + controls which kernel to use when running the notebook. e.g: python2 + no_cache : '1' or False + controls whether to clean the temporary directory in which the + notebook was run and re-download any resource file. The default + behavior is to not clean the directory. Set to '1' to force clean the + directory. + NB: in the real CI, the tests will re-download everything since they + start from a clean workspace. + temp_dir: string + The temporary sub-directory directory in which to run the notebook. + + Returns + ------- + Returns true if the workbook runs with no warning or exception. 
+ """ + + notebook_path = os.path.join(*([notebook_dir] + notebook.split('/'))) + working_dir = os.path.join(*([temp_dir] + notebook.split('/'))) + + if no_cache == '1': + print("Cleaning and setting up temp directory '{}'".format(working_dir)) + shutil.rmtree(temp_dir, ignore_errors=True) + + errors = [] + notebook = None + if not os.path.isdir(working_dir): + os.makedirs(working_dir) + try: + notebook = nbformat.read(notebook_path + '.ipynb', as_version=IPYTHON_VERSION) + # Adding a small delay to allow time for sockets to be freed + # stop-gap measure to battle the 1000ms linger of socket hard coded + # in the kernel API code + time.sleep(1.1) + if kernel is not None: + eprocessor = ExecutePreprocessor(timeout=TIME_OUT, kernel_name=kernel) + else: + eprocessor = ExecutePreprocessor(timeout=TIME_OUT) + nb, _ = eprocessor.preprocess(notebook, {'metadata': {'path': working_dir}}) + except Exception as err: + err_msg = str(err) + errors.append(err_msg) + finally: + if notebook is not None: + output_file = os.path.join(working_dir, "output.txt") + nbformat.write(notebook, output_file) + output_nb = io.open(output_file, mode='r', encoding='utf-8') + for line in output_nb: + if "Warning:" in line: + errors.append("Warning:\n" + line) + if len(errors) > 0: + print('\n'.join(errors)) + return False + return True From fe07d504f97b822ec66be6295428a331268d1277 Mon Sep 17 00:00:00 2001 From: ctcyang Date: Tue, 24 Jul 2018 14:38:17 -0700 Subject: [PATCH 03/63] [MXNET-331] Single machine All Reduce Topology-aware Communication (Updated) (#11591) * add multiroot all-reduce communication pattern * fix bug with UpdateWeight * fix PCI-E links appearing in weight matrix bug * optimization to skip CopyFromTo in ReduceInner gains a bit of throughput * remove unnecessary if statement * Add tests * add more tests, 6 tests left to add * get rid of some dead code * Add comments * Add randomized tests for backtrack and kernighan-lin * Fix Postprocess * Add switch for first valid tree when num_gpus > 8, and for maximum weight when num_gpus <= 8 * Kernighan-Lin seems to find better trees * get rid of printfs * change defaults * inherit from CommDevice instead of Comm * Fix lint errors * Add Python test using MXNET_KVSTORE_USETREE, fix CMake compilation problem, add header guard * fix lint errors * better header guard that works for tests * get rid of unused variable warning * retrigger jenkins * resolve 2 comments * address comment using Class to do test, get rid of extraneous test, use PCI-E as fallback for GPUs that are not linked by NVLink * address comments * fix a few bugs * get rid of printfs * get rid of print * Comment out test for now * fix 2 more bugs * fix segfault * change PrintVector, PrintTopo, PrintMatrix to LOG(INFO) instead of stdout * Fix code alignment * get rid of todo * Make changes to env variable names to indicate they are TREE-related * Add note saying when ARRAY_BOUND env var takes effect --- docs/faq/env_var.md | 26 + src/kvstore/comm.h | 121 +-- src/kvstore/comm_tree.h | 532 ++++++++++++ src/kvstore/gpu_topology.h | 1100 ++++++++++++++++++++++++ src/kvstore/kvstore_local.h | 8 +- tests/cpp/kvstore/gpu_topology_test.cc | 676 +++++++++++++++ tests/python/gpu/test_device.py | 82 ++ tests/python/gpu/test_kvstore_gpu.py | 67 +- tests/python/gpu/test_nccl.py | 3 +- 9 files changed, 2538 insertions(+), 77 deletions(-) create mode 100644 src/kvstore/comm_tree.h create mode 100644 src/kvstore/gpu_topology.h create mode 100644 tests/cpp/kvstore/gpu_topology_test.cc create mode 100644 
tests/python/gpu/test_device.py

diff --git a/docs/faq/env_var.md b/docs/faq/env_var.md
index 12a898aad..881bc14fd 100644
--- a/docs/faq/env_var.md
+++ b/docs/faq/env_var.md
@@ -83,6 +83,32 @@ export MXNET_GPU_WORKER_NTHREADS=3
   - The minimum size of a "big array".
   - When the array size is bigger than this threshold, MXNET_KVSTORE_REDUCTION_NTHREADS threads are used for reduction.
   - This parameter is also used as a load balancer in kvstore. It controls when to partition a single weight to all the servers. If the size of a single weight is less than MXNET_KVSTORE_BIGARRAY_BOUND, then it is sent to a single randomly picked server; otherwise it is partitioned to all the servers.
+
+* MXNET_KVSTORE_USETREE
+  - Values: 0(false) or 1(true) ```(default=0)```
+  - If true, MXNet tries to use tree reduction for Push and Pull communication.
+  - Otherwise, MXNet uses the default Push and Pull implementation.
+  - [Tree reduction technology](http://www.sysml.cc/doc/178.pdf) has been shown to be faster than the standard ```--kv-store device``` Push/Pull and ```--kv-store nccl``` Push/Pull for small batch sizes.
+
+* MXNET_KVSTORE_LOGTREE
+  - Values: 0(false) or 1(true) ```(default=0)```
+  - If true and MXNET_KVSTORE_USETREE is set to 1, MXNet will log the reduction trees that have been generated.
+
+* MXNET_KVSTORE_TREE_ARRAY_BOUND
+  - Values: Int ```(default=10000000)```
+  - The minimum size of a "big array".
+  - When the array size is bigger than this threshold and MXNET_KVSTORE_USETREE is set to 1, multiple trees are used to load balance the big gradient being communicated in order to better saturate link bandwidth.
+  - Note: This environment variable only takes effect if Tree KVStore is being used (MXNET_KVSTORE_USETREE=1).
+
+* MXNET_KVSTORE_TREE_BACKTRACK
+  - Values: 0(false) or 1(true) ```(default=0)```
+  - If true and MXNET_KVSTORE_USETREE is set to 1, MXNet tries to use backtracking to generate the trees required for tree reduction.
+  - If false and MXNET_KVSTORE_USETREE is set to 1, MXNet tries to use the Kernighan-Lin heuristic to generate the trees required for tree reduction.
+
+* MXNET_KVSTORE_TREE_LINK_USAGE_PENALTY
+  - Values: Float ```(default=0.7)```
+  - The multiplicative penalty term applied to a link that has already been used once.
+
 * MXNET_ENABLE_GPU_P2P
   - Values: 0(false) or 1(true) ```(default=1)```
   - If true, MXNet tries to use GPU peer-to-peer communication, if available on your device,
diff --git a/src/kvstore/comm.h b/src/kvstore/comm.h
index d242dc304..34cab3037 100644
--- a/src/kvstore/comm.h
+++ b/src/kvstore/comm.h
@@ -474,6 +474,31 @@ class CommDevice : public Comm {
     }
   }
 
+  const NDArray& ReduceRowSparse(int key, const std::vector<NDArray>& src,
+                                 int priority) {
+    auto& buf = merge_buf_[key];
+    std::vector<NDArray> reduce(src.size());
+
+    const NDArrayStorageType stype = src[0].storage_type();
+    NDArray& buf_merged = buf.merged_buf(stype);
+    if (buf.copy_buf.empty()) {
+      // initialize buffer for copying during reduce
+      buf.copy_buf.resize(src.size());
+      for (size_t j = 0; j < src.size(); ++j) {
+        buf.copy_buf[j] = NDArray(stype, src[0].shape(), buf_merged.ctx(), true, src[0].dtype());
+      }
+    }
+    CHECK(src[0].storage_type() == buf.copy_buf[0].storage_type())
+      << "Storage type mismatch detected. " << src[0].storage_type() << "(src) vs. 
" + << buf.copy_buf[0].storage_type() << "(buf.copy_buf)"; + for (size_t i = 0; i < src.size(); ++i) { + CopyFromTo(src[i], &(buf.copy_buf[i]), priority); + reduce[i] = buf.copy_buf[i]; + } + ElementwiseSum(reduce, &buf_merged, priority); + return buf_merged; + } + const NDArray& Reduce(int key, const std::vector& src, int priority) override { // when this reduce is called from kvstore_dist, gc is not set @@ -490,13 +515,14 @@ class CommDevice : public Comm { InitBuffersAndComm(src); auto& buf = merge_buf_[key]; - std::vector reduce(src.size()); const NDArrayStorageType stype = src[0].storage_type(); NDArray& buf_merged = buf.merged_buf(stype); // normal dense reduce if (stype == kDefaultStorage) { CopyFromTo(src[0], &buf_merged, priority); + + std::vector reduce(src.size()); reduce[0] = buf_merged; if (buf.copy_buf.empty()) { @@ -514,24 +540,11 @@ class CommDevice : public Comm { CopyFromTo(src[i+1], &(buf.copy_buf[i]), priority); reduce[i+1] = buf.copy_buf[i]; } + ElementwiseSum(reduce, &buf_merged, priority); } else { // sparse reduce - if (buf.copy_buf.empty()) { - // initialize buffer for copying during reduce - buf.copy_buf.resize(src.size()); - for (size_t j = 0; j < src.size(); ++j) { - buf.copy_buf[j] = NDArray(stype, src[0].shape(), buf_merged.ctx(), true, src[0].dtype()); - } - } - CHECK(src[0].storage_type() == buf.copy_buf[0].storage_type()) - << "Storage type mismatch detected. " << src[0].storage_type() << "(src) vs. " - << buf.copy_buf[0].storage_type() << "(buf.copy_buf)"; - for (size_t i = 0; i < src.size(); ++i) { - CopyFromTo(src[i], &(buf.copy_buf[i]), priority); - reduce[i] = buf.copy_buf[i]; - } + buf_merged = ReduceRowSparse(key, src, priority); } - ElementwiseSum(reduce, &buf_merged, priority); return buf_merged; } @@ -659,6 +672,42 @@ class CommDevice : public Comm { } } + using KeyAttrs = std::tuple; + // try to allocate buff on device evenly + void InitMergeBuffer(const std::vector& devs) { + std::sort(sorted_key_attrs_.begin(), sorted_key_attrs_.end(), []( + const KeyAttrs& a, const KeyAttrs& b) { + return std::get<1>(a).Size() > std::get<1>(b).Size(); + }); + + std::unordered_map> ctx_info; + for (auto d : devs) { + ctx_info[d.dev_id] = std::make_pair(d, 0); + } + + for (size_t i = 0; i < sorted_key_attrs_.size(); ++i) { + const int key = std::get<0>(sorted_key_attrs_[i]); + const TShape& shape = std::get<1>(sorted_key_attrs_[i]); + const int type = std::get<2>(sorted_key_attrs_[i]); + auto& buf = merge_buf_[key]; + Context ctx; + size_t min_size = std::numeric_limits::max(); + for (auto it = ctx_info.begin(); it != ctx_info.end(); ++it) { + size_t size = it->second.second; + if (size <= min_size) { + ctx = it->second.first; + min_size = size; + } + } + // Delayed allocation - as the dense merged buffer might not be used at all if push() + // only sees sparse arrays + bool delay_alloc = true; + buf.merged = NDArray(shape, ctx, delay_alloc, type); + ctx_info[ctx.dev_id].second += shape.Size(); + } + inited_ = true; + } + private: void EnableP2P(const std::vector& devs) { #if MXNET_USE_CUDA @@ -702,43 +751,6 @@ class CommDevice : public Comm { #endif } - using KeyAttrs = std::tuple; - // try to allocate buff on device evenly - void InitMergeBuffer(const std::vector& devs) { - std::sort(sorted_key_attrs_.begin(), sorted_key_attrs_.end(), []( - const KeyAttrs& a, const KeyAttrs& b) { - return std::get<1>(a).Size() > std::get<1>(b).Size(); - }); - - std::unordered_map> ctx_info; - for (auto d : devs) { - ctx_info[d.dev_id] = std::make_pair(d, 0); - } - - for 
(size_t i = 0; i < sorted_key_attrs_.size(); ++i) { - const int key = std::get<0>(sorted_key_attrs_[i]); - const TShape& shape = std::get<1>(sorted_key_attrs_[i]); - const int type = std::get<2>(sorted_key_attrs_[i]); - auto& buf = merge_buf_[key]; - Context ctx; - size_t min_size = std::numeric_limits::max(); - for (auto it = ctx_info.begin(); it != ctx_info.end(); ++it) { - size_t size = it->second.second; - if (size <= min_size) { - ctx = it->second.first; - min_size = size; - } - } - // Delayed allocation - as the dense merged buffer might not be used at all if push() - // only sees sparse arrays - bool delay_alloc = true; - buf.merged = NDArray(shape, ctx, delay_alloc, type); - ctx_info[ctx.dev_id].second += shape.Size(); - } - inited_ = true; - } - - std::vector sorted_key_attrs_; /// \brief temporal space for pushing and pulling struct BufferEntry { /// \brief the dense merged value for reduce and broadcast operations @@ -773,7 +785,10 @@ class CommDevice : public Comm { NDArray sparse_merged; }; std::unordered_map merge_buf_; + + public: bool inited_; + std::vector sorted_key_attrs_; }; } // namespace kvstore diff --git a/src/kvstore/comm_tree.h b/src/kvstore/comm_tree.h new file mode 100644 index 000000000..1ebfcdc80 --- /dev/null +++ b/src/kvstore/comm_tree.h @@ -0,0 +1,532 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/** + * Copyright (c) 2018 by Contributors + */ +#ifndef MXNET_KVSTORE_COMM_TREE_H_ +#define MXNET_KVSTORE_COMM_TREE_H_ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "mxnet/ndarray.h" +#include "gradient_compression.h" +#include "../ndarray/ndarray_function.h" +#include "../operator/tensor/sparse_retain-inl.h" +#include "./kvstore_utils.h" +#include "./gpu_topology.h" +namespace mxnet { +namespace kvstore { +/** + * \brief an implementation of Comm that performs reduction on device + * directly using tree. + * + * It is faster if the total device-to-device bandwidths is larger than + * device-to-cpu, which is often true for 4 or 8 GPUs. But it uses more device + * memory. 
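+ *
+ * As a rough illustration (not taken from this patch's tests), with 4 GPUs
+ * and kBranch = 2 the generated structure is a binary reduction tree such as:
+ *
+ *          g0           level 2: g1 -> g0 and g3 -> g2 (partial sums)
+ *        /    \         level 1: g2 -> g0 (final sum at the root)
+ *      g0      g2
+ *     /  \    /  \
+ *    g0  g1  g2  g3
+ *
+ * ReduceInner walks such a tree from the leaves to the root, and
+ * BroadcastInner walks it from the root back to the leaves; the actual tree
+ * shape is derived from the measured link topology in gpu_topology.h.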
+ */ +class CommDeviceTree : public CommDevice { + public: + CommDeviceTree() { + inited_ = false; + gpuarray_bound_ = dmlc::GetEnv("MXNET_KVSTORE_TREE_ARRAY_BOUND", 10000000); + backtrack_ = dmlc::GetEnv("MXNET_KVSTORE_TREE_BACKTRACK", 0); + link_usage_penalty_ = dmlc::GetEnv("MXNET_KVSTORE_TREE_LINK_USAGE_PENALTY", 0.7); + } + + virtual ~CommDeviceTree() { } + + void Init(int key, const NDArrayStorageType stype, const TShape& shape, + int dtype = mshadow::kFloat32) override { + tree_sorted_key_attrs_.emplace_back(key, shape, dtype); + sorted_key_attrs_.emplace_back(key, shape, dtype); + } + + void InitBuffersAndComm(const std::vector& src) { + if (!inited_) { + for (const auto& a : src) { + devs_.push_back(a.ctx()); + } + QueryTopology(); + // Note: delayed allocation set to true, because we do not want to allocate + // both in TreeBufferEntry and BufferEntry, so we use a size_t to keep + // track of each key's shape within BufferEntry + // -this information is required for inherited Reduce- and + // BroadcastRowSparse + InitMergeBuffer(devs_); + InitMergeBufferTree(); + if (dmlc::GetEnv("MXNET_ENABLE_GPU_P2P", 1)) { + EnableP2P(); + } + } + } + + /** + * \brief Reduce src to tree_merge_buf_ + * \param key is the id of the gradient we are doing Reduce on + * \param src is the array of values located on different GPUs + * \param root is the id of the GPU we want to send result of reduce to + * \param merged_row is the id of the slice we are taking + * \param priority the priority of the operation + */ + const NDArray& ReduceInner(int key, const std::vector& src, int root, + int merged_row, int priority) { + std::vector> reduce(devs_.size()); + + TreeBufferEntry& random_buf = tree_merge_buf_[0][key]; + const NDArrayStorageType stype = random_buf.merged[0].storage_type(); + std::vector& topology = topology_[root]; + NDArray buf_slice; + + if (stype == kDefaultStorage) { + // Copy everything into buf.merged for each gpu + for (size_t i = 0; i < src.size(); ++i) { + int start = scan_[root][depth_]; + int end = scan_[root][depth_+1]; + + for (int j = start; j < end; ++j) { + int topo_id = topology[j]; + TreeBufferEntry& buf = tree_merge_buf_[topo_id][key]; + + if (devs_[topo_id] == src[i].ctx()) { + CopyFromTo(src[i], &(buf.merged[merged_row]), priority); + } + } + } + + for (int level = depth_; level > 0; --level) { + int start = scan_[root][level ]; + int end = scan_[root][level+1]; + + unsigned is_dest = 0; + int dest_id = 0; + for (int j = start; j < end; ++j) { + int topo_id = topology[j]; + dest_id = (is_dest == 0) ? topo_id : dest_id; + + TreeBufferEntry& buf_dest = tree_merge_buf_[dest_id][key]; + TreeBufferEntry& buf_from = tree_merge_buf_[topo_id][key]; + + if (!is_dest) { + if (reduce[dest_id].size() == 0) { + reduce[dest_id].push_back(buf_dest.merged[merged_row]); + } + } else { + if (dest_id != topo_id) { + CopyFromTo(buf_from.merged[merged_row], + &(buf_dest.copy_buf[merged_row][is_dest-1]), + priority); + reduce[dest_id].push_back( + buf_dest.copy_buf[merged_row][is_dest-1]); + } + } + + is_dest = (is_dest == static_cast(kBranch)-1) ? 
+ 0 : is_dest+1; + } + + start = scan_[root][level-1]; + end = scan_[root][level]; + int source = end; + for (int i = start; i < end; ++i) { + int gpu_id = topology[i]; + + // source keeps track of 2 leaf nodes, while start keeps track of parent + int dest_id = topology[source]; + int from_id = topology[source+1]; + source += 2; + + // conditional to detect whether operation must be done + if (reduce[gpu_id].size() > 1 && dest_id != from_id) { + TreeBufferEntry& buf = tree_merge_buf_[gpu_id][key]; + ElementwiseSum(reduce[gpu_id], &(buf.merged[merged_row]), priority); + } + } + + // reset + for (unsigned i = 0; i < devs_.size(); ++i) { + reduce[i].clear(); + } + } + } else { + LOG(FATAL) << "Only dense input supported for now"; + } + + int topo_id = topology[0]; + TreeBufferEntry& buf = tree_merge_buf_[topo_id][key]; + return buf.merged[merged_row]; + } + + const NDArray& Reduce(int key, const std::vector& src, + int priority) override { + // when this reduce is called from kvstore_dist, gc is not set + // we don't do compression twice in dist_sync_device + if ((gc_ != nullptr) && (gc_->get_type() != CompressionType::kNone)) { + return ReduceCompressed(key, src, priority); + } + + // avoid extra copy for single device, but it may bring problems for + // abnormal usage of kvstore + if (src.size() == 1) { + return src[0]; + } + + InitBuffersAndComm(src); + std::vector> slice(devs_.size()); + std::vector> broadcast_slice(devs_.size()); + std::vector slice_scan(devs_.size()+1); + + int total_size = src[0].shape().Size(); + unsigned first_size = src[0].shape()[0]; + + const NDArrayStorageType stype = src[0].storage_type(); + // normal dense reduce + if (stype == kDefaultStorage) { + if (total_size > gpuarray_bound_ && first_size >= 2*devs_.size()) { + // Find slice bounds + slice_scan[0] = 0; + int slice_size = first_size/devs_.size(); + for (unsigned i = 1; i < devs_.size(); ++i) { + slice_scan[i] = slice_scan[i-1] + slice_size; + } + slice_scan[devs_.size()] = src[0].shape()[0]; + + // row: which slice + // col: which gpu + for (unsigned row = 0; row < devs_.size(); ++row) { + for (unsigned col = 0; col < devs_.size(); ++col) { + TreeBufferEntry& buf = tree_merge_buf_[col][key]; + NDArray curr_slice = src[col].Slice(slice_scan[row], + slice_scan[row+1]); + slice[row].push_back(curr_slice); + broadcast_slice[row].push_back(&(buf.merged[row])); + } + } + + // Do reduce-scatter (multiroot reduce) + // input: slice (src) + // output: buf.merge_buf + for (unsigned i = 0; i < devs_.size(); ++i) { + ReduceInner(key, slice[i], i, i, priority); + } + + for (unsigned i = 0; i < devs_.size(); ++i) { + BroadcastInner(key, *(broadcast_slice[i][i]), broadcast_slice[i], i, i, priority); + } + } else { + int root = 0; + ReduceInner(key, src, root, 0, priority); + + TreeBufferEntry& buf = tree_merge_buf_[root][key]; + return buf.merged[0]; + } + + // Copy from list of small NDArrays to one big NDArray, which is returned + int gpu_id = 0; + return src[gpu_id]; + } else { + // sparse reduce + return ReduceRowSparse(key, src, priority); + } + } + + void BroadcastInner(int key, const NDArray& src, + const std::vector& dst, int root, + int merged_row, int priority) { + // copy to root of tree + std::vector& topology = topology_[root]; + std::vector temp(devs_.size()); + int gpu_id = topology[0]; + if (merged_row == -1) + CopyFromTo(src, dst[gpu_id], priority); + temp[gpu_id] = *dst[gpu_id]; + + for (int level = 1; level <= depth_; ++level) { + int start = scan_[root][level]; + int end = scan_[root][level+1]; + + 
unsigned is_src = 0; + int src_id = 0; + for (int j = start; j < end; ++j) { + int topo_id = topology[j]; + src_id = (is_src == 0) ? topo_id : src_id; + + if (is_src && src_id != topo_id) { + CopyFromTo(temp[src_id], dst[topo_id], priority); + temp[topo_id] = *dst[topo_id]; + } + + is_src = (is_src == static_cast(kBranch)-1) ? 0 : is_src+1; + } + } + } + + void Broadcast(int key, const NDArray& src, + const std::vector dst, int priority) override { + if (!inited_) { + // copy to a random device first + int dev_id = key % dst.size(); + CopyFromTo(src, dst[dev_id], priority); + for (size_t i = 0; i < dst.size(); ++i) { + if (i != static_cast(dev_id)) { + CopyFromTo(*dst[dev_id], dst[i], priority); + } + } + } else { + int total_size = src.shape().Size(); + unsigned first_size = src.shape()[0]; + const NDArrayStorageType stype = src.storage_type(); + // normal dense reduce + if (stype == kDefaultStorage) { + if (total_size > gpuarray_bound_ && first_size >= 2*devs_.size()) { + std::vector slice_scan(devs_.size()+1); + slice_scan[0] = 0; + int slice_size = (dst[0]->shape()[0])/devs_.size(); + for (unsigned i = 1; i < devs_.size(); ++i) { + slice_scan[i] = slice_scan[i-1] + slice_size; + } + slice_scan[devs_.size()] = dst[0]->shape()[0]; + + for (unsigned gpu_id = 0; gpu_id < dst.size(); ++gpu_id) { + TreeBufferEntry& buf = tree_merge_buf_[gpu_id][key]; + for (unsigned i = 0; i < devs_.size(); ++i) { + if (devs_[gpu_id] == dst[gpu_id]->ctx()) { + NDArray curr_slice = dst[gpu_id]->Slice(slice_scan[i], slice_scan[i+1]); + CopyFromTo(buf.merged[i], &curr_slice, priority); + } + } + } + } else { + int root = 0; + BroadcastInner(key, src, dst, root, -1, priority); + }} else { + LOG(FATAL) << "Only dense input supported for now"; + } + } + } + + private: + void EnableP2P() { +#if MXNET_USE_CUDA + std::vector gpus; + for (const auto& d : devs_) { + if (d.dev_mask() == gpu::kDevMask) { + gpus.push_back(d.dev_id); + } + } + int n = static_cast(gpus.size()); + int enabled = 0; + std::vector p2p(n*n); + for (int i = 0; i < n; ++i) { + cudaSetDevice(gpus[i]); + for (int j = 0; j < n; j++) { + int access; + cudaDeviceCanAccessPeer(&access, gpus[i], gpus[j]); + if (access) { + cudaError_t e = cudaDeviceEnablePeerAccess(gpus[j], 0); + if (e == cudaSuccess || e == cudaErrorPeerAccessAlreadyEnabled) { + ++enabled; + p2p[i*n+j] = 1; + } + } + } + } + if (enabled != n*(n-1)) { + // print warning info if not fully enabled + LOG(WARNING) << "only " << enabled << " out of " + << n*(n-1) << " GPU pairs are enabled direct access. " + << "It may affect the performance. " + << "You can set MXNET_ENABLE_GPU_P2P=0 to turn it off"; + std::string access(n, '.'); + for (int i = 0; i < n; ++i) { + for (int j = 0; j < n; ++j) { + access[j] = p2p[i*n+j] ? 
'v' : '.'; + } + LOG(WARNING) << access; + } + } +#endif + } + + void QueryTopology() { +#if MXNET_USE_CUDA + std::vector link_matrix(devs_.size()*devs_.size()); + GetP2PWeight(devs_, &link_matrix); + if (backtrack_) + LOG(INFO) << "Using Backtracking to generate trees"; + else + LOG(INFO) << "Using Kernighan-Lin to generate trees"; + ComputeTrees(link_matrix, devs_.size(), link_usage_penalty_, backtrack_, + &topology_, &scan_); + + depth_ = ComputeDepth(devs_.size()); +#endif + } + + using KeyAttrs = std::tuple; + // try to allocate buff on device evenly + void InitMergeBufferTree() { + LOG(INFO) << "Using Tree"; + + // same as all-reduce, except: + // 1) Allocate copy_buf here instead of in Reduce() + // 2) Force copy_buf to be of kRecvBufferSize + // 3) Do not use greedy assignment; all keys are assigned to each GPU + for (unsigned i = 0; i < devs_.size(); ++i) + tree_merge_buf_.push_back(std::unordered_map()); + + bool delay_alloc = true; + std::map key_dist; + + for (size_t i = 0; i < tree_sorted_key_attrs_.size(); ++i) { + const int key = std::get<0>(tree_sorted_key_attrs_[i]); + const TShape& shape = std::get<1>(tree_sorted_key_attrs_[i]); + const int type = std::get<2>(tree_sorted_key_attrs_[i]); + + if (key_dist.find(shape.Size()) == key_dist.end()) + key_dist[shape.Size()] = 1; + else + key_dist[shape.Size()]++; + + int start = scan_[0][depth_]; + int end = scan_[0][depth_+1]; + + // In order to generalize to any number of GPUs in arbitrary order, we use + // strategy of having found the mapping from 0, 1, ..., n_gpus to dev_id. + // For example, if the user wants to use --gpus 4,2,3,1,7,5,0, they can do // so: + // + // idx: 0 1 2 3 4 5 6 + // dev_id: 4 2 3 1 7 5 0 + // + // From this, we: + // 1) generate a link topology matrix with dimensions n_gpus x n_gpus + // (link_matrix) + // + // 2) the reduction trees are saved as indices from 0, 1, ..., n_gpus + // in a vector of vectors (topology_): + // + // index | topology_[index] + // ------------------------- + // 0 | [Tree 0] + // 1 | [Tree 1] + // . + // . + // . 
+  using KeyAttrs = std::tuple<int, TShape, int>;
+  // try to allocate buffers on devices evenly
+  void InitMergeBufferTree() {
+    LOG(INFO) << "Using Tree";
+
+    // same as all-reduce, except:
+    // 1) Allocate copy_buf here instead of in Reduce()
+    // 2) Force copy_buf to be of kRecvBufferSize
+    // 3) Do not use greedy assignment; all keys are assigned to each GPU
+    for (unsigned i = 0; i < devs_.size(); ++i)
+      tree_merge_buf_.push_back(std::unordered_map<int, TreeBufferEntry>());
+
+    bool delay_alloc = true;
+    std::map<size_t, int> key_dist;
+
+    for (size_t i = 0; i < tree_sorted_key_attrs_.size(); ++i) {
+      const int key = std::get<0>(tree_sorted_key_attrs_[i]);
+      const TShape& shape = std::get<1>(tree_sorted_key_attrs_[i]);
+      const int type = std::get<2>(tree_sorted_key_attrs_[i]);
+
+      if (key_dist.find(shape.Size()) == key_dist.end())
+        key_dist[shape.Size()] = 1;
+      else
+        key_dist[shape.Size()]++;
+
+      int start = scan_[0][depth_];
+      int end = scan_[0][depth_+1];
+
+      // In order to generalize to any number of GPUs in arbitrary order, we
+      // use the mapping from 0, 1, ..., n_gpus to dev_id. For example, if the
+      // user wants to use --gpus 4,2,3,1,7,5,0, they can do so:
+      //
+      //   idx:    0 1 2 3 4 5 6
+      //   dev_id: 4 2 3 1 7 5 0
+      //
+      // From this, we:
+      // 1) generate a link topology matrix with dimensions n_gpus x n_gpus
+      //    (link_matrix)
+      //
+      // 2) the reduction trees are saved as indices from 0, 1, ..., n_gpus
+      //    in a vector of vectors (topology_):
+      //
+      //      index  | topology_[index]
+      //      -------------------------
+      //      0      | [Tree 0]
+      //      1      | [Tree 1]
+      //      .
+      //      .
+      //      .
+      //      n_gpus | [Tree n_gpus]
+      //
+      // 3) We use the mapping (devs_) to retrieve dev_id and device context
+      for (int j = start; j < end; ++j) {
+        int topo_id = topology_[0][j];
+        auto& buf = tree_merge_buf_[topo_id][key];
+        Context ctx = devs_[topo_id];
+
+        // buf.merged enforces that we only visit each GPU once
+        if (buf.merged.empty()) {
+          TShape shape_copy = shape;
+          int total_size = shape.Size();
+          unsigned first_size = shape[0];
+          if (total_size > gpuarray_bound_ && first_size >= 2*devs_.size()) {
+            // Find slice bounds
+            int slice_size = first_size/devs_.size();
+            int last_slice = first_size-(devs_.size()-1)*slice_size;
+            shape_copy[0] = slice_size;
+            buf.merged.resize(devs_.size());
+            for (unsigned row = 0; row < devs_.size(); ++row) {
+              if (row == devs_.size()-1)
+                shape_copy[0] = last_slice;
+              buf.merged[row] = NDArray(shape_copy, ctx, delay_alloc, type);
+              buf.copy_buf.push_back(std::vector<NDArray>());
+              if (buf.copy_buf[row].empty()) {
+                buf.copy_buf[row].resize(kBranch-1);
+                for (size_t col = 0; col < buf.copy_buf[0].size(); ++col) {
+                  buf.copy_buf[row][col] = NDArray(buf.merged[row].shape(),
+                                                   buf.merged[row].ctx(),
+                                                   delay_alloc,
+                                                   buf.merged[row].dtype());
+                }
+              }
+            }
+          } else {
+            buf.merged.push_back(NDArray(shape, ctx, false, type));
+            if (buf.copy_buf.empty()) {
+              buf.copy_buf.push_back(std::vector<NDArray>());
+              buf.copy_buf[0].resize(kBranch-1);
+              for (size_t col = 0; col < buf.copy_buf[0].size(); ++col) {
+                buf.copy_buf[0][col] = NDArray(buf.merged[0].shape(),
+                                               buf.merged[0].ctx(), delay_alloc,
+                                               buf.merged[0].dtype());
+              }
+            }
+          }
+        }
+      }
+    }
+
+    for (auto it = key_dist.begin(); it != key_dist.end(); ++it) {
+      LOG(INFO) << "Size " << it->first << " occurs " << it->second << " times";
+    }
+    inited_ = true;
+  }
+
+  std::vector<KeyAttrs> tree_sorted_key_attrs_;
+  /// \brief temporary space for pushing and pulling
+  struct TreeBufferEntry {
+    /// \brief the dense merged value for reduce and broadcast operations
+    std::vector<NDArray> merged;
+    /// \brief the gpu buffer for copy during reduce operation
+    std::vector<std::vector<NDArray>> copy_buf;
+    /// \brief the residual buffer for gradient compression
+    std::vector<NDArray> residual;
+    /// \brief the small buffer for compressed data in sender
+    std::vector<NDArray> compressed_send_buf;
+    /// \brief the small buffer for compressed data in receiver
+    std::vector<NDArray> compressed_recv_buf;
+
+   private:
+    /// \brief the sparse merged value for reduce and rowsparse broadcast operations
+    NDArray sparse_merged;
+  };
+  /// \brief intent of tree_merge_buf_ in old comm.h: store key->gpu mapping
+  ///        new intent: for every gpu: store key->memory mapping
+  std::vector<std::unordered_map<int, TreeBufferEntry>> tree_merge_buf_;
+
+  /// \brief NVLink-connected topology in full binary tree format
+  std::vector<std::vector<size_t>> topology_;
+  std::vector<std::vector<size_t>> scan_;
+  std::vector<Context> devs_;
+
+  int depth_;
+  int gpuarray_bound_;
+  bool backtrack_;
+  float link_usage_penalty_;
+
+  /// \brief constant for maximum size of recv buffer per GPU
+  ///        2: only receive from 1 other GPU
+  const int kBranch = 2;
+};
+
+}  // namespace kvstore
+}  // namespace mxnet
+#endif  // MXNET_KVSTORE_COMM_TREE_H_
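
One practical note before the next file: CommDeviceTree is opt-in. The kvstore_local.h hunk near the end of this patch gates it on an environment variable; the helper below is only a sketch of that check, assuming the dmlc::GetEnv helper that the hunk itself uses.

    // Sketch of the selection logic from the kvstore_local.h hunk below.
    // MXNET_KVSTORE_USETREE=1 turns the tree comm on (CUDA builds only);
    // MXNET_KVSTORE_LOGTREE=1 additionally logs the generated trees.
    bool UseTreeComm() {
      return dmlc::GetEnv("MXNET_KVSTORE_USETREE", 0) & MXNET_USE_CUDA;
    }
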
diff --git a/src/kvstore/gpu_topology.h b/src/kvstore/gpu_topology.h
new file mode 100644
index 000000000..a8801499c
--- /dev/null
+++ b/src/kvstore/gpu_topology.h
@@ -0,0 +1,1100 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/**
+ * Copyright (c) 2015 by Contributors
+ */
+#ifndef MXNET_KVSTORE_GPU_TOPOLOGY_H_
+#define MXNET_KVSTORE_GPU_TOPOLOGY_H_
+#if MXNET_USE_CUDA
+  #include <cuda_runtime.h>
+  #include <cuda.h>
+#endif
+#include <algorithm>
+#include <limits>
+#include <map>
+#include <queue>
+#include <random>
+#include <stack>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
+#define MXNET_KVSTORE_MAXDEPTH 16
+
+namespace mxnet {
+namespace kvstore {
+
+static bool kLogTree = dmlc::GetEnv("MXNET_KVSTORE_LOGTREE", false);
+
+template <typename T>
+inline void PrintVector(const std::string& str, const std::vector<T>& vec) {
+  LOG(INFO) << str << ":";
+  std::string output;
+  for (unsigned i = 0; i < vec.size(); ++i)
+    output += std::to_string(vec[i]) + " ";
+  LOG(INFO) << output;
+}
+
+template <typename T>
+inline void PrintMatrix(const std::string& str, const std::vector<T>& matrix,
+                        int num_rows, int num_cols) {
+  LOG(INFO) << str << ":";
+  int count = 0;
+  for (int row = 0; row < num_rows; ++row) {
+    std::string output;
+    for (int col = 0; col < num_cols; ++col) {
+      output += std::to_string(static_cast<int>(matrix[count++])) + " ";
+    }
+    LOG(INFO) << output;
+  }
+}
+
+inline void PrintTopo(const std::string& str, const std::vector<size_t>& topo_row,
+                      std::vector<size_t> scan_row) {
+  LOG(INFO) << str << ":";
+  int depth = scan_row.size()-1;
+  for (int row = 0; row < depth; ++row) {
+    int start = scan_row[row];
+    int end = scan_row[row+1];
+    std::string output;
+    for (; start < end; start++) {
+      for (int i = 0; i < (2 << (depth-row-2))+1; ++i) {
+        output += " ";
+      }
+      output += std::to_string(topo_row[start]);
+    }
+    LOG(INFO) << output;
+  }
+}
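
The connectivity test that follows treats only weights greater than 1 as edges, so a machine whose GPUs reach each other solely over PCI-E counts as disconnected. A behavioral sketch, with the matrix borrowed from the TestIsConnected2 case in gpu_topology_test.cc at the end of this patch:

    // Two NVLink pairs (0-2 and 1-3) joined only by PCI-E (weight 1):
    // IsConnected ignores the PCI-E edges and reports false.
    std::vector<float> W = {1, 1, 2, 1,
                            1, 1, 1, 2,
                            2, 1, 1, 1,
                            1, 2, 1, 1};
    bool nvlink_connected = mxnet::kvstore::IsConnected(W, 4);  // false
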
+/**
+ * \brief Uses BFS to find whether undirected graph is connected or not given its
+ *        adjacency matrix
+ *        Note: only consider matrix values > 1, because we care about whether it is
+ *        connected using only NVLink connections
+ */
+template <typename T>
+inline bool IsConnected(const std::vector<T>& matrix, int num_gpus) {
+  int source = 0;
+  std::vector<bool> visited(num_gpus, false);
+  std::queue<int> work_list;
+
+  work_list.push(source);
+  visited[source] = true;
+  while (!work_list.empty()) {
+    int curr = work_list.front();
+    work_list.pop();
+
+    for (int i = 0; i < num_gpus; ++i) {
+      int neighbour = matrix[curr*num_gpus + i];
+      if (i != curr && neighbour > 1 && visited[i] == false) {
+        visited[i] = true;
+        work_list.push(i);
+      }
+    }
+  }
+
+  for (int i = 0; i < num_gpus; ++i) {
+    if (visited[i] == false)
+      return false;
+  }
+  return true;
+}
+
+/**
+ * \brief Generate adjacency matrix with row/col numbering from 0, 1, ..., n_gpu
+ * \param devs is a vector of GPU contexts
+ * \param matrix is adjacency matrix of link topology graph
+ *        where edge weight represents relative performance of NVIDIA GPUs
+ *          0: Self-connection
+ *          1: PCI-E
+ *          2: 1 NVLink connection
+ *          3: 2 NVLink connections
+ */
+template <typename T>
+inline void GetP2PWeight(const std::vector<Context>& devs, std::vector<T>* matrix) {
+  int num_gpus = devs.size();
+  int count = 0;
+  std::vector<int> zero_dev_id(num_gpus, -1);
+  for (auto d : devs) {
+    zero_dev_id[count] = d.dev_id;
+    count++;
+  }
+
+#if MXNET_USE_CUDA
+  cudaDeviceP2PAttr attr;
+  attr = cudaDevP2PAttrPerformanceRank;
+  std::vector<int> max(num_gpus, 0);
+
+  for (int row = 0; row < num_gpus; ++row) {
+    for (int col = 0; col < num_gpus; ++col) {
+      if (row == col) {
+        (*matrix)[row*num_gpus+col] = 0;
+      } else {
+        int value;
+        int row_gpu = zero_dev_id[row];
+        int col_gpu = zero_dev_id[col];
+        cudaDeviceGetP2PAttribute(&value, attr, row_gpu, col_gpu);
+        if (value > max[row])
+          max[row] = value;
+        (*matrix)[row*num_gpus+col] = static_cast<T>(value)+1;
+      }
+    }
+  }
+
+  // Check that all GPUs have at least 1 NVLink connection
+  int max_value = 0;
+  for (unsigned int i = 0; i < max.size(); ++i) {
+    if (max[i] > max_value)
+      max_value = max[i];
+  }
+
+  // If all GPUs are connected by NVLink, then we can use NVLink only
+  // to communicate instead of going over PCI-E, so we set PCI-E links to 0
+  //
+  // Otherwise, we will make a distinction between PCI-E GPUDirect links and
+  // PCI-E through CPU links, which are slower and show a queueing effect
+  // (i.e. the more packets there are, the slower the link becomes).
+  //
+  // For the latter links, we will set links that were 0 to 1/num_gpus to
+  // account for this queueing effect.
+  bool connected = IsConnected(*matrix, num_gpus);
+
+  if (connected) {
+    for (auto& matrix_value : *matrix) {
+      matrix_value = (matrix_value == 1) ? 0 : matrix_value;
+    }
+  } else {
+    for (auto& matrix_value : *matrix) {
+      matrix_value = (matrix_value == 1) ? 1./num_gpus : matrix_value;
+    }
+  }
+#else
+  LOG(WARNING) << "GPU required for link topology";
+#endif
+}
+
+/**
+ * \brief Dense matrix-vector multiplication
+ *        Assume: matrix is square
+ *        y = A*x (no accumulate)
+ */
+template <typename T>
+inline void gemv(const std::vector<T>& A, const std::vector<int>& x,
+                 std::vector<T>* y) {
+  int nrows = x.size();
+  int count = 0;
+  for (int row=0; row < nrows; ++row) {
+    (*y)[row] = 0;
+    for (int col=0; col < nrows; ++col) {
+      (*y)[row] += A[count]*static_cast<T>(x[col]);
+      count++;
+    }
+  }
+}
+
+/**
+ * \brief Element-wise multiplication between 2 dense vectors
+ *        w = w * alpha*u
+ */
+template <typename T>
+inline void ewisemult(const std::vector<int>& u, T alpha, std::vector<T>* w) {
+  int nelem = u.size();
+  for (int i=0; i < nelem; ++i) {
+    (*w)[i] *= alpha*static_cast<T>(u[i]);
+  }
+}
+
+/**
+ * \brief Computes best 2 nodes a,b to swap given objective function:
+ *        g = max_{a \in A, b \in B} D(a) + D(b) - 2*W(a,b)
+ *
+ *        Optimization: Only need to look at upper triangular since weight matrix is
+ *        symmetric
+ */
+template <typename T>
+inline void FindBestMove(const std::vector<T>& W,
+                         const std::vector<int>& P_temp,
+                         const std::vector<T>& D,
+                         const std::unordered_set<int>& used,
+                         int* a, int* b, T* g) {
+  int nrows = P_temp.size();
+  *g = 0;
+  *a = -1;
+  *b = -1;
+  for (int row=0; row < nrows; ++row) {
+    if (P_temp[row] == 0 || used.find(row) != used.end()) continue;
+    for (int col=row+1; col < nrows; ++col) {
+      if (P_temp[col] == 0 || P_temp[row] == P_temp[col]) continue;
+
+      T cost = D[row]+D[col]-2*W[row*nrows+col];
+      if (cost > *g) {
+        *g = cost;
+        *a = row;
+        *b = col;
+      }
+    }
+  }
+}
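
A worked instance of the objective above, with all numbers taken from the TestFindBestMove case at the end of this patch; only the surrounding setup (headers, namespace) is assumed:

    std::vector<int> W = {0, 2, 2, 3, 3, 1, 1, 1,
                          2, 0, 3, 2, 1, 3, 1, 1,
                          2, 3, 0, 3, 1, 1, 2, 1,
                          3, 2, 3, 0, 1, 1, 1, 2,
                          3, 1, 1, 1, 0, 2, 2, 3,
                          1, 3, 1, 1, 2, 0, 3, 2,
                          1, 1, 2, 1, 2, 3, 0, 3,
                          1, 1, 1, 2, 3, 2, 3, 0};
    std::vector<int> P = {1, 2, 3, 4, 5, 6, 7, 8};  // every vertex in its own set
    std::vector<int> D = {20, 0, 0, 0, 0, 0, 0, 20};
    std::unordered_set<int> used;
    int a, b, g;
    mxnet::kvstore::FindBestMove(W, P, D, used, &a, &b, &g);
    // Best swap is a = 0, b = 7: g = D[0] + D[7] - 2*W[0*8+7] = 20 + 20 - 2 = 38.
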
+/**
+ * \brief Performs partition on each existing partition in graph W if partition has
+ *        more than 2 elements in it
+ * \param stop returns true if no partitions with more than 2 elements found
+ *             returns false otherwise
+ * \param cluster_pairs stores the mapping that tells us which 2 clusters are
+ *        the output of partitioning one large cluster
+ */
+template <typename T>
+inline bool KernighanLin(const std::vector<T>& W, std::vector<int>* P,
+                         int* num_partitions,
+                         std::vector<std::pair<int, int>>* cluster_pairs,
+                         std::mt19937* gen) {
+  std::vector<int> histogram(*num_partitions, 0);
+  std::vector<int> P_temp(P->size(), 0);
+  std::vector<int> P_temp2(P->size(), 0);
+  std::vector<T> D(P->size(), 0);
+  std::vector<T> D_temp(P->size(), 0);
+
+  // 0) For every partition, determine if it can be partitioned further.
+  //    To do this, we must do a histogram of each partition:
+  for (unsigned i=0; i < P->size(); ++i) {
+    histogram[(*P)[i]]++;
+  }
+
+  bool stop = true;
+  for (unsigned color=0; color < histogram.size(); ++color) {
+    int partition_size = histogram[color];
+    // Save cluster in preparation for push to topo in GenerateBinaryTree()
+    if (partition_size <= 2) {
+      cluster_pairs->push_back(
+          std::pair<int, int>(static_cast<int>(color), -partition_size));
+
+    // Do Kernighan-Lin if clustering is necessary
+    } else {
+      stop = false;
+
+      // 1) If it has more than 2 elements, we can partition further.
+      //    Assign a random balanced partition of it
+      //    -balanced is more important than random, so allocate first half to A
+      //    and rest to B
+      int first_partition = 0;
+      int target_partition = partition_size/2;
+      std::vector<int> cluster_list;
+
+      for (unsigned i = 0; i < P->size(); ++i) {
+        // Required to shift from [0,1] to {-1,1}
+        //  1 means vertex i is in Cluster A
+        // -1 means vertex i is in Cluster B
+        if ((*P)[i] == static_cast<int>(color)) {
+          cluster_list.push_back(i);
+        } else {
+          P_temp[i] = 0;
+        }
+      }
+
+      // 1b) Shuffle using random generator
+      std::shuffle(cluster_list.begin(), cluster_list.end(), *gen);
+      for (unsigned i = 0; i < cluster_list.size(); ++i) {
+        if (first_partition < target_partition) {
+          int dest = cluster_list[i];
+          P_temp[dest] = 1;
+          first_partition++;
+        } else {
+          int dest = cluster_list[i];
+          P_temp[dest] = -1;
+        }
+      }
+
+      // 2) Do iterations of Kernighan-Lin until convergence
+      T g_max = 0;
+      int g_k = -1;
+      unsigned count = 0;
+      do {
+        count++;
+        P_temp2 = P_temp;
+
+        // a) Compute difference between external and internal costs of all
+        //    elements in vector D
+        gemv(W, P_temp, &D);
+        ewisemult(P_temp, -1.f, &D);
+
+        // av and bv are used to hold candidates for moving
+        // gv stores the score associated with move
+        std::vector<int> av;
+        std::vector<int> bv;
+        std::vector<T> gv;
+
+        std::unordered_set<int> used;
+
+        for (int iter=0; iter < partition_size/2; ++iter) {
+          // b) Find best move by looking through upper triangular of W matrix
+          int a, b;
+          T g;
+          FindBestMove(W, P_temp, D, used, &a, &b, &g);
+          if (g <= 0) {
+            // No move with positive gain left; stop this pass
+            g_max = 0;
+            break;
+          }
+
+          // c) Store best move to av, bv, gv
+          av.push_back(a);
+          bv.push_back(b);
+          gv.push_back(g);
+
+          // d) Eliminate best move from consideration in vector P_temp
+          P_temp[a] *= -1;
+          P_temp[b] *= -1;
+          used.insert(a);
+          used.insert(b);
+
+          // e) Update D using P_temp
+          gemv(W, P_temp, &D);
+          ewisemult(P_temp, -1.f, &D);
+          D[a] = 0;
+          D[b] = 0;
+        }
+
+        // 3) Find when to stop by doing linear scan through gv
+        //    Recompute score g_max
+        for (unsigned k = 0; k < gv.size(); ++k) {
+          if (k > 0)
+            gv[k] += gv[k-1];
+          if (gv[k] > g_max) {
+            g_max = gv[k];
+            g_k = k + 1;
+          }
+        }
+
+        // 4) If move is "good", commit moves by updating P_temp and P_temp2
+        //    Otherwise, rollback changes to P_temp2
+        if (g_max > 0) {
+          for (int i = 0; i < g_k; i++) {
+            int a = av[i];
+            int b = bv[i];
+            int temp = P_temp2[a];
+            P_temp2[a] = P_temp2[b];
+            P_temp2[b] = temp;
+
+            P_temp = P_temp2;
+          }
+        } else {
+          P_temp = P_temp2;
+        }
+      } while (g_max > 0 && count <= P->size());
+
+      // 5) Update P using P_temp
+      int moves = 0;
+      for (unsigned i=0; i < P->size(); ++i) {
+        if (P_temp[i] == -1) {
+          (*P)[i] = *num_partitions;
+          moves++;
+        }
+      }
+      cluster_pairs->push_back(std::pair<int, int>(static_cast<int>(color),
+                                                   static_cast<int>(*num_partitions)));
+
+      (*num_partitions)++;
+    }
+  }
+
+  return stop;
+}
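
Putting the routine above to work looks like this; the fragment mirrors TestKernighanLin1 from the unit tests at the end of this patch, so nothing here is invented API:

    std::vector<float> W = {0, 1, 2, 3, 2, 4,
                            1, 0, 1, 4, 2, 1,
                            2, 1, 0, 3, 2, 1,
                            3, 4, 3, 0, 4, 3,
                            2, 2, 2, 4, 0, 2,
                            4, 1, 1, 3, 2, 0};
    std::vector<int> P(6, 0);  // all six vertices start in partition 0
    std::vector<std::pair<int, int>> cluster_pairs;
    int num_partitions = 1;
    std::mt19937 gen(1);
    bool stop = mxnet::kvstore::KernighanLin(W, &P, &num_partitions,
                                             &cluster_pairs, &gen);
    // Afterwards num_partitions == 2, cluster_pairs == {(0, 1)}, and P labels
    // each vertex 0 or 1 (either side may get either label); stop == false
    // because a split was still possible.
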
+
+/**
+ * \brief Returns root of a given color if found in roots
+ *        Returns -1 if it is not found
+ */
+inline int GetRoot(const std::vector<int>& P, int color,
+                   const std::unordered_set<int>& roots) {
+  for (auto root : roots) {
+    if (P[root] == color)
+      return root;
+  }
+  return -1;
+}
+
+/**
+ * \brief Returns the first vertex of a given color that is not the parent
+ *        Returns -1 if it is not found
+ */
+inline int GetChild(const std::vector<int>& P, int color, int parent) {
+  for (unsigned i = 0; i < P.size(); ++i) {
+    if (P[i] == color && static_cast<int>(i) != parent)
+      return i;
+  }
+  return -1;
+}
+
+// Computes highest weighted edge a-b
+//
+// Constraints:
+// -vertex a must be parent
+// -vertex b must be in dest_cluster
+//
+// @output: b is vector of candidates if a tie happens
+//          g is weight of edge
+// Optimization: Only need to look at row a in matrix
+template <typename T>
+inline void FindBestEdge(const std::vector<T>& W, const std::vector<int>& P,
+                         int parent, int dest_cluster, std::vector<int>* b, T* g) {
+  int nrows = P.size();
+  int row = parent;
+  *g = 0;
+  b->push_back(-1);
+  for (int col=0; col < nrows; ++col) {
+    if (col == row || P[col] != dest_cluster) continue;
+
+    T cost = W[row*nrows+col];
+    if (cost > *g) {
+      b->clear();
+    }
+    if (cost >= *g) {
+      b->push_back(col);
+      *g = cost;
+    }
+  }
+}
+
+// Given a vector of color pairs, appends to binary tree matrix topo
+// @input:  W gives the link topology
+//          P gives the result of KL partitioning
+//          cluster_pairs gives pairing between clusters, an edge is found
+//          between each pairing
+//          roots gives source vertices
+//          gen gives random number generation to break ties
+// @output: cluster_pairs
+//          topo_row says where new edges are appended to
+//          scan_row says where we should start looking for topo_row
+template <typename T>
+inline int KLGenerateBinaryTree(const std::vector<T>& W,
+                                const std::vector<int>& P,
+                                std::vector<std::pair<int, int>>* cluster_pairs,
+                                std::unordered_set<int>* roots,
+                                std::vector<size_t>* topo_row,
+                                std::vector<size_t>* scan_row,
+                                std::mt19937* gen) {
+  std::unordered_set<int> new_roots;
+  std::unordered_map<int, int> new_topo;
+  int reset = 0;
+
+  for (unsigned i = 0; i < cluster_pairs->size(); ++i) {
+    if (i == 0)
+      scan_row->push_back(topo_row->size());
+    int parent, child = -1;
+    if ((*cluster_pairs)[i].second == -2) {
+      // Root must be color of pair.first
+      int color = (*cluster_pairs)[i].first;
+      parent = GetRoot(P, color, *roots);
+      if (parent == -1) return 1;
+      child = GetChild(P, color, parent);
+    } else if ((*cluster_pairs)[i].second == -1) {
+      int color = (*cluster_pairs)[i].first;
+      parent = GetRoot(P, color, *roots);
+      if (parent == -1) return 1;
+      child = parent;
+    } else {
+      // Root must exist in either first or second element of pair
+      int color = (*cluster_pairs)[i].first;
+      parent = GetRoot(P, color, *roots);
+      color = (parent == -1) ? (*cluster_pairs)[i].second : color;
+      parent = (parent == -1) ? GetRoot(P, color, *roots) : parent;
+
+      int from_cluster = color;
+      int dest_cluster = (from_cluster == (*cluster_pairs)[i].first) ?
+ (*cluster_pairs)[i].second : (*cluster_pairs)[i].first; + + std::vector candidates; + T weight; + FindBestEdge(W, P, parent, dest_cluster, &candidates, &weight); + + // If no candidates + if (candidates[0] != -1) { + std::shuffle(candidates.begin(), candidates.end(), *gen); + child = candidates[0]; + } + + if (child == -1) { + new_roots.insert(parent); + return 1; + } else { + new_roots.insert(parent); + new_roots.insert(child); + } + } + + new_topo[parent] = child; + } + + int depth = scan_row->size(); + int start = (*scan_row)[depth-2]; + int end = (*scan_row)[depth-1]; + + for (int i = start; i < end; ++i) { + int parent = (*topo_row)[i]; + int child; + + // If not first, check previous level whether or not we are encountering + // this root for the first time in this level of the tree + if (i != start && parent == static_cast((*topo_row)[i-1])) + child = parent; + else + child = new_topo[parent]; + topo_row->push_back(parent); + topo_row->push_back(child); + } + + cluster_pairs->clear(); + roots->clear(); + *roots = std::move(new_roots); + + return reset; +} + +// @input: n is the number of nodes in a balanced binary tree +// @output: returns how many levels of binary tree there are +inline int ComputeDepth(int n) { + for (int depth = 0; depth < MXNET_KVSTORE_MAXDEPTH; ++depth) { + int num = 2 << depth; + if (n <= num) + return depth+1; + } + return 0; +} + +// Checks whether a given state forms a spanning tree that satisfies: +// -balanced +// -binary +// -each edge in tree corresponds to link in network topology +// -each edge in tree does not form self-loop +template +inline bool IsValid(const std::vector& W, const std::vector& state, + int num_elements, int row, int depth) { + // At each level of tree, check whether edge: + // -corresponds to link in network topology + // -corresponds to self-loop + for (int i = 0; i < depth; ++i) { + int stride = 1 << i; + for (int j = 0; j+stride < row; j += 2*stride) { + int from = state[j]; + int dest = state[j+stride]; + if (W[from*num_elements + dest] == static_cast(0) && from != dest) { + return false; + } + } + } + + // If we encounter GPU for first time, increment found_vec. + // Otherwise, do nothing + std::unordered_set found; + std::vector found_vec(num_elements, 0); + for (auto val : state) { + if (val == -1) + continue; + if (val < num_elements) { + if (found.find(val) == found.end()) { + found.insert(val); + found_vec[val] = 1; + } + } else { + return false; + } + } + + // modifier is maximum number of repeats a single GPU can take + // e.g. 5 GPUs in 3-level binary tree => one GPU can repeat 3x + // GPU0 GPU0 GPU0 GPU0 GPU1 GPU2 GPU3 GPU4 + int modifier = (1 << depth) - num_elements; + int num_found = found.size(); + + // So we know we have an invalid state if we find: + // -only 4 unique GPUs + // -9 unique GPUs + if (row < num_elements) { + if (num_found > row || num_found < row - modifier) { + return false; + } + + // If we are at last recursive level, we can apply a more stringent check: + // -if some GPU is not found, then we are in invalid state + } else if (row == static_cast(state.size())) { + for (int i = 0; i < num_elements; ++i) { + if (found_vec[i] == 0) { + return false; + } + } + } + + return true; +} + +// This function takes a spanning tree encoded as state (result), which may have +// repeated GPUs representing NO-SENDs and converts it into a unique format. +// This has the effect of recognizing redundant sends, grouping them together, +// so that the Reduce call knows not to perform a CopyFromTo. 
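
As a usage sketch of the transformation just described (mirroring the TestPostprocess case at the end of this patch; the call site is illustrative):

    std::vector<int> result = {3, 0, 0, 4, 1, 2, 5, 6};
    mxnet::kvstore::Postprocess(&result, 7, 3);  // 7 GPUs, depth-3 tree
    // result becomes {3, 3, 0, 4, 1, 2, 5, 6}: the redundant send 3->0
    // at the leaf level is rewritten as the no-op 3->3.
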
+// +// Initial result: [3 0 0 4 1 2 5 6] +// Final result: [3 3 0 4 1 2 5 6] +// +// Initial: +// 3 +// 3 1 +// 3 0 1 5 +// 3 0 0 4 1 2 5 6 // GPU3 will make redundant send to GPU0 +// +// Final: +// 3 +// 3 1 +// 3 0 1 5 +// 3 3 0 4 1 2 5 6 // GPU3 knows not to make redundant send to itself +inline void Postprocess(std::vector* result, int num_elements, int depth) { + for (int level = depth - 1; level >= 0; --level) { + int stride = 1 << level; + std::vector histogram_above(num_elements, 0); + for (unsigned i = 0; i < result->size(); i += 2*stride) { + int val = (*result)[i]; + histogram_above[val]++; + } + std::vector histogram(num_elements, 0); + for (unsigned i = 0; i < result->size(); i += stride) { + int val = (*result)[i]; + histogram[val]++; + } + + for (int i = result->size()-stride; i-stride >= 0; i -= 2*stride) { + int from = (*result)[i]; + int dest = (*result)[i-stride]; + if ((histogram[from] > 1 || histogram_above[from] >= 1) && from != dest) { + (*result)[i] = dest; + histogram[from]--; + } + } + } +} + +// Given a spanning tree encoded as a state (result) and weight of each edge +// in the link topology graph, compute its weight. +// @input: penalty controls whether or not penalties are applied to tree +// -usually turned on when backtracking to get better solutions +// -usually turned off when outside the penalty to get weight of tree +template +inline T ComputeTreeWeight(const std::vector& W, const std::vector& result, + int num_elements, int depth, bool penalty) { + T weight = 0.f; + std::unordered_set links_used; + + for (int i = 0; i < depth; ++i) { + int stride = 1 << i; + std::vector nodes_used(num_elements, false); + for (unsigned j = 0; j+stride < result.size(); j += 2*stride) { + int from = result[j]; + int dest = result[j+stride]; + if (from != dest) { + weight += W[from*num_elements+dest]; + + // Penalize: (1) use of redundant edges in a single tree + // (2) repeated use of a GPU in a single tree at the same + // level above the leaf level + if (links_used.find(from*num_elements+dest) != links_used.end() + && penalty) { + weight -= 100; + } + links_used.insert(from*num_elements+dest); + links_used.insert(dest*num_elements+from); + } + + nodes_used[from] = true; + if (i > 0 && nodes_used[dest] && penalty) { + weight -= 10; + } + nodes_used[dest] = true; + } + } + + return weight; +} + +/** + * \brief Given a spanning tree encoded as result, which was convenient for performing + * backtracking, convert it topology_ and scan_ in the classic "binary tree + * stored in an array" format. For binary trees scan_ is redundant, but this + * additional data structure leaves future generalization to k-radix trees. + * + * Initial result: [3 3 0 4 1 2 5 6] + * topology_: [3 3 1 3 0 1 5 3 3 0 4 1 2 5 6] + * scan_: [0 1 3 7 15] + * + * topology_ is stored in the classic "binary tree stored in an array" format + * e.g. 
3 + * 3 1 + * 3 0 1 5 + * 3 3 0 4 1 2 5 6 + * + * Returns false if invalid tree in result + * Otherwise returns true + */ +inline bool FormTopology(const std::vector& result, + std::vector* topo_row, + std::vector* scan_row, + int depth) { + for (unsigned i = 0; i < result.size(); ++i) + if (result[i] == -1) + return false; + + scan_row->push_back(topo_row->size()); + for (int i = depth; i > 0; --i) { + int stride = 1 << i; + for (unsigned j = 0; j < result.size(); j += stride) { + int from = result[j]; + topo_row->push_back(from); + } + scan_row->push_back(topo_row->size()); + } + + // Insert at the end, result vector + topo_row->insert(topo_row->end(), result.begin(), result.end()); + scan_row->push_back(topo_row->size()); + return true; +} + +/** + * \brief Recursive function that finds a spanning tree, which fulfills the following + * conditions: + * -balanced + * -binary + * -maximum weight + */ +template +inline bool RecursiveBacktrack(const std::vector& W, + std::vector* state, + std::vector* best_result, + T* best_result_weight, + int row, + int num_elements, + int depth, + bool optimal) { + if (row == static_cast(state->size())) { + std::vector result = *state; + Postprocess(&result, num_elements, depth); + T weight = ComputeTreeWeight(W, result, num_elements, depth, true); + + // Save this spanning tree if it is highest weight tree found sofar + if (weight > *best_result_weight) { + std::swap(*best_result_weight, weight); + *best_result = result; + } + return !optimal; + } + + // If not last recursive level, try to find valid tree for next level + bool stop = false; + for (int j = 0; j < num_elements; ++j) { + (*state)[row] = j; + if (IsValid(W, state, num_elements, row+1, depth)) + stop = RecursiveBacktrack(W, state, best_result, best_result_weight, + row+1, num_elements, depth, optimal); + (*state)[row] = -1; + if (stop) + return stop; + } + return stop; +} + +template +inline void IterativeBacktrack(const std::vector& W, + std::vector* state, + std::vector* best_result, + T* best_result_weight, + int row, + int num_elements, + int depth, + bool optimal) { + std::stack state_stack; + row = 1; + int pos = 0; + state_stack.push(pos); + + while (true) { + // If there is no valid position, 2 cases: + // a) if stack is empty, break and stop search + // b) if stack is not empty, pop stack and set current position to next + // position backtrack to previous row + while (!state_stack.empty() && pos >= num_elements) { + pos = state_stack.top(); + pos++; + state_stack.pop(); + (*state)[state_stack.size()+1] = -1; + row--; + } + if (state_stack.empty()) break; + + (*state)[row] = pos; + // If there is a valid position push the position to stack, set current + // position to 0 and move to next row + if (IsValid(W, *state, num_elements, row+1, depth)) { + state_stack.push(pos); + pos = 0; + row++; + } else { + pos++; + (*state)[row] = -1; + } + + // If stack has size N, a solution is found + // Pop stack, set current position to next position + // Backtrack to find next solution + if (row == static_cast(state->size())) { + std::vector result = *state; + Postprocess(&result, num_elements, depth); + T weight = ComputeTreeWeight(W, result, num_elements, depth, true); + + // Save this spanning tree if it is highest weight tree found so far + if (weight > *best_result_weight) { + std::swap(*best_result_weight, weight); + *best_result = result; + } + if (!optimal) break; + + pos = state_stack.top(); + pos++; + state_stack.pop(); + (*state)[state_stack.size()] = -1; + row--; + } + } +} + +/** + * 
\brief Apply penalty factor alpha to each link in link topology graph that is used + * by the spanning tree + */ +template +inline void UpdateWeight(std::vector* W, const std::vector& topo_row, + int num_elements, float alpha) { + for (unsigned i = 1; i < topo_row.size() - 1; i += 2) { + unsigned parent = topo_row[i]; + unsigned child = topo_row[i+1]; + if (!(parent >= num_elements*num_elements || + child >= num_elements*num_elements) && (parent != child)) { + (*W)[parent*num_elements+child] *= alpha; + (*W)[child*num_elements+parent] *= alpha; + } + } +} + +/** + * \brief Do brute-force backtracking approach if Kernighan-Lin fails to find a binary + * tree of height Log P. + * + * Constraints: + * 1) minimize depth (balance) + * 2) maximize edge weight + * 3) tree is binary + */ +template +inline bool BacktrackGenerateBinaryTree(std::vector* W, + int num_elements, + int root, + std::vector* topo_row, + std::vector* scan_row) { + // Clear before starting + topo_row->clear(); + scan_row->clear(); + + // Compute depth + // num_elements: depth + // 5: 3 8 + // 6: 3 8 + // 7: 3 8 + // 8: 3 8 + // 9: 4 16 + int depth = ComputeDepth(num_elements); + int depth_leaves = 1 << depth; + + // State vector + // -1 means unplaced + std::vector state(depth_leaves, -1); + std::vector result(depth_leaves, -1); + T result_weight = std::numeric_limits::lowest(); + + // Place root and try all combinations + state[0] = root; + + // Seek optimal solution until depth <= 3 i.e. 8 GPUs + // For larger numbers of GPUs, settle for first tree found (non-optimal), but + // this saves a lot of runtime, because Backtrack is exponential time + if (depth <= 3) { + IterativeBacktrack(*W, &state, &result, &result_weight, 1, num_elements, + depth, true); + } else { + IterativeBacktrack(*W, &state, &result, &result_weight, 1, num_elements, + depth, false); + } + return FormTopology(result, topo_row, scan_row, depth); +} + +/** + * \brief ComputeTreesFromRoot does the same thing as ComputeTrees, with the only + * exception being it will do it from a fixed GPU as root + */ +template +inline void ComputeTreesFromRoot(std::vector* W, + int num_elements, + int root, + float alpha, + bool backtrack, + std::vector* topo, + std::vector* scan) { + int num_partitions = 1; + + // Initialize partition array to indicate which partition each element belongs + // to beginning with 0 + std::vector P(num_elements, 0); + + // Initialize vector of pairs that will tell us edges between what 2 clusters + // we should be looking to build the tree from + std::vector> cluster_pairs; + + // Initialize vector of roots that will tell us edges between + std::unordered_set roots; + roots.insert(root); + + // Will be used to obtain a seed for the random number engine + // RNG: Standard mersenne_twister_engine seeded with rd() + // -use 0 for testing (TODO: remove this) + // std::random_device rd; + // std::mt19937 gen(rd()); + std::mt19937 gen(1); + + // Temporary variables for rewinding + std::vector P_temp; + int num_partitions_temp; + std::unordered_set roots_temp; + std::vector topo_temp; + std::vector scan_temp; + + // Determine number of partition levels + // If first partition, determine root of maximal spanning tree + bool stop = false; + int reset = 1; + int level = 0; + + while (!backtrack && (!stop || reset)) { + if (reset == 1) { + cluster_pairs.clear(); + P_temp = P; + num_partitions_temp = num_partitions; + roots_temp = roots; + topo_temp = *topo; + scan_temp = *scan; + } + + // Run Kernighan-Lin to generate partition + stop = 
KernighanLin(*W, &P_temp, &num_partitions_temp, &cluster_pairs, + &gen); + + // Use partitions found and a given root to find best inter-cluster edge for + // each pair of clusters, and returns them as roots of next cluster + // If reset is true, then rewind back to previous clustering + reset = KLGenerateBinaryTree(*W, P_temp, &cluster_pairs, &roots_temp, + &topo_temp, &scan_temp, &gen); + + if (reset) + level++; + if (level > 10) break; + } + + bool success = true; + if (reset == 1) { + // LOG(INFO) << "No valid binary tree found from root " << root << ", try backtracking"; + success = BacktrackGenerateBinaryTree(W, num_elements, root, topo, scan); + } else { + *topo = topo_temp; + *scan = scan_temp; + scan->push_back(topo->size()); + } + if (success) + UpdateWeight(W, *topo, num_elements, alpha); + else + LOG(FATAL) << "No valid binary tree found from root " << root << " using backtracking"; +} + +/** + * \brief ComputeTrees computes balanced binary spanning trees of maximum edge weight + * given a link topology graph stored in adjacency matrix format + * \param W is the link topology matrix + * \param num_elements is the number of GPUs + * \param alpha is the link usage penalty + * \param backtrack is whether or not we use backtracking to generate trees + * \param topo stores the trees generated + * \param scan stores the start of each level of each tree + */ +template +inline void ComputeTrees(const std::vector& W, + int num_elements, + float alpha, + bool backtrack, + std::vector>* topo, + std::vector>* scan) { + std::vector W_copy = W; + + topo->clear(); + scan->clear(); + for (int i = 0; i < num_elements; ++i) { + topo->push_back(std::vector()); + scan->push_back(std::vector()); + (*topo)[i].push_back(i); + (*scan)[i].push_back(0); + ComputeTreesFromRoot(&W_copy, num_elements, i, alpha, backtrack, + &((*topo)[i]), &((*scan)[i])); + } + + // Note: must sum up adj matrix to show link usage before we readjust topo + // from 0, 1, ..., n_gpus format to dev_id format, which will cause segfault + std::vector adj(W.size(), 0); + for (int row = 0; row < num_elements; ++row) { + for (unsigned col = 1; col < (*topo)[0].size(); col += 2) { + int from = std::min((*topo)[row][col], (*topo)[row][col+1]); + int dest = std::max((*topo)[row][col], (*topo)[row][col+1]); + if (from != dest) { + adj[from*num_elements+dest] += 1; + adj[dest*num_elements+from] += 1; + } + } + } + + std::vector> topo_temp(num_elements, + std::vector()); + + if (kLogTree) { + for (int i = 0; i < num_elements; ++i) + PrintTopo("Tree "+std::to_string(i), (*topo)[i], (*scan)[i]); + + PrintMatrix("W", W, num_elements, num_elements); + PrintMatrix("Links", adj, num_elements, num_elements); + } +} +} // namespace kvstore +} // namespace mxnet +#endif // MXNET_KVSTORE_GPU_TOPOLOGY_H_ diff --git a/src/kvstore/kvstore_local.h b/src/kvstore/kvstore_local.h index 84e2700a2..324bc2c95 100644 --- a/src/kvstore/kvstore_local.h +++ b/src/kvstore/kvstore_local.h @@ -34,6 +34,7 @@ #include #include #include "./comm.h" +#include "./comm_tree.h" #include "./kvstore_utils.h" #include "../ndarray/ndarray_function.h" @@ -56,7 +57,12 @@ class KVStoreLocal : public KVStore { */ explicit KVStoreLocal(bool use_device_comm) : KVStore() { if (use_device_comm) { - comm_ = new CommDevice(); + bool tree = dmlc::GetEnv("MXNET_KVSTORE_USETREE", 0) & MXNET_USE_CUDA; + if (tree) { + comm_ = new CommDeviceTree(); + } else { + comm_ = new CommDevice(); + } } else { comm_ = new CommCPU(); } diff --git a/tests/cpp/kvstore/gpu_topology_test.cc 
b/tests/cpp/kvstore/gpu_topology_test.cc new file mode 100644 index 000000000..0f6d5f11c --- /dev/null +++ b/tests/cpp/kvstore/gpu_topology_test.cc @@ -0,0 +1,676 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * Copyright (c) 2018 by Contributors + * \file gpu_topology_test.cc + * \brief gpu topology tests +*/ + +#include +#include +#include +#include "../src/kvstore/gpu_topology.h" + +void GenerateMatrix(std::vector* W, int num_gpus, float k, + std::mt19937* gen) { + std::uniform_real_distribution<> dis(0., 1.); + for (int row = 0; row < num_gpus; ++row) { + for (int col = row+1; col < num_gpus; ++col) { + float sample = dis(*gen); + if (sample < k) + continue; + sample = dis(*gen); + if (sample < 0.33f) { + (*W)[row*num_gpus+col] = 1.f; + (*W)[col*num_gpus+row] = 1.f; + } else if (sample < 0.66f) { + (*W)[row*num_gpus+col] = 2.f; + (*W)[col*num_gpus+row] = 2.f; + } else { + (*W)[row*num_gpus+col] = 3.f; + (*W)[col*num_gpus+row] = 3.f; + } + } + } +} + +bool IsSatisfactory(const std::vector& W, int num_gpus, int depth) { + for (int row = 0; row < num_gpus; ++row) { + int out_edges = 0; + for (int col = 0; col < num_gpus; ++col) { + if (W[row*num_gpus+col] > 0.f) + out_edges++; + } + if (out_edges < depth) + return false; + } + return true; +} + +// Generates random link topology matrix using random number generator +void TestComputeTreesRandomized(int num_gpus, float alpha, int backtrack, + std::mt19937* gen) { + std::uniform_real_distribution<> dis(0.f, 1.f); + bool satisfied = false; + std::vector W(num_gpus*num_gpus, 0.f); + int depth = mxnet::kvstore::ComputeDepth(num_gpus); + while (!satisfied) { + float k = dis(*gen); + std::fill(W.begin(), W.end(), 0.f); + GenerateMatrix(&W, num_gpus, k, gen); + satisfied = IsSatisfactory(W, num_gpus, depth); + } + + std::vector> topo; + std::vector> scan; + mxnet::kvstore::ComputeTrees(W, num_gpus, alpha, backtrack, &topo, &scan); + + unsigned correct_topo_size = (1 << (depth + 1)) - 1; + unsigned correct_scan_size = depth+2; + ASSERT_EQ(topo.size(), static_cast(num_gpus)); + for (unsigned i = 0; i < topo.size(); ++i) { + ASSERT_EQ(correct_topo_size, topo[i].size()); + ASSERT_EQ(correct_scan_size, scan[i].size()); + } +} + +// Permutes matrix W using permutation vector P and stores output in matrix A +// Assumption: W is square and symmetric +void PermuteMatrix(const std::vector& W, + const std::vector& P, + std::vector* A) { + int nrows = P.size(); + std::vector temp(nrows*nrows, 0); + + int count = 0; + for (int row=0; row < nrows; ++row) { + for (int col=0; col < nrows; ++col) { + int row_start = P[row]; + temp[count] = W[row_start*nrows+col]; + count++; + } + } + + count = 0; + for (int row=0; row < nrows; ++row) { + for (int col=0; col < nrows; ++col) { + int col_index = 
P[col]; + (*A)[count] = temp[row*nrows+col_index]; + count++; + } + } +} + +TEST(GpuTopology, TestFormTopology) { + std::vector state0 = {3, 2, 1, 5, 0, 0, 4, 6}; + std::vector topo0; + std::vector scan0; + std::vector correct0 = {3, 3, 0, 3, 1, 0, 4, 3, 2, 1, 5, 0, 0, 4, 6}; + std::vector correct_scan0 = {0, 1, 3, 7, 15}; + mxnet::kvstore::FormTopology(state0, &topo0, &scan0, 3); + ASSERT_EQ(topo0.size(), correct0.size()); + for (unsigned i = 0; i < correct0.size(); ++i) + ASSERT_EQ(static_cast(topo0[i]), correct0[i]); + ASSERT_EQ(scan0.size(), correct_scan0.size()); + for (unsigned i = 0; i < correct_scan0.size(); ++i) + ASSERT_EQ(static_cast(scan0[i]), correct_scan0[i]); + + std::vector state1 = {3, 2, 0, 4, 1, 1, 5, 6}; + std::vector topo1; + std::vector scan1; + std::vector correct1 = {3, 3, 1, 3, 0, 1, 5, 3, 2, 0, 4, 1, 1, 5, 6}; + std::vector correct_scan1 = {0, 1, 3, 7, 15}; + mxnet::kvstore::FormTopology(state1, &topo1, &scan1, 3); + ASSERT_EQ(topo1.size(), correct1.size()); + for (unsigned i = 0; i < correct1.size(); ++i) + ASSERT_EQ(static_cast(topo1[i]), correct1[i]); + ASSERT_EQ(scan1.size(), correct_scan1.size()); + for (unsigned i = 0; i < correct_scan1.size(); ++i) + ASSERT_EQ(static_cast(scan1[i]), correct_scan1[i]); +} + +TEST(GpuTopology, TestComputeTreeWeight) { + std::vector W = {0, 2, 2, 3, 3, 0, 0, + 2, 0, 3, 2, 0, 3, 0, + 2, 3, 0, 3, 0, 0, 2, + 3, 2, 3, 0, 0, 0, 0, + 3, 0, 0, 0, 0, 2, 2, + 0, 3, 0, 0, 2, 0, 3, + 0, 0, 2, 0, 2, 3, 0}; + + std::vector state0 = {3, 2, 1, 5, 0, 0, 4, 6}; + ASSERT_EQ(mxnet::kvstore::ComputeTreeWeight(W, state0, 7, 3, false), 16); + + std::vector state1 = {3, 2, 0, 4, 1, 1, 5, 6}; + ASSERT_EQ(mxnet::kvstore::ComputeTreeWeight(W, state1, 7, 3, false), 17); +} + +TEST(GpuTopology, TestPostprocess) { + std::vector result0 = {3, 0, 0, 4, 1, 2, 5, 6}; + std::vector correct0 = {3, 3, 0, 4, 1, 2, 5, 6}; + mxnet::kvstore::Postprocess(&result0, 7, 3); + for (unsigned i = 0; i < correct0.size(); ++i) + ASSERT_EQ(result0[i], correct0[i]); + + std::vector result1 = {2, 0, 0, 4, 1, 3, 5, 1}; + std::vector correct1 = {2, 2, 0, 4, 1, 3, 5, 5}; + mxnet::kvstore::Postprocess(&result1, 6, 3); + for (unsigned i = 0; i < correct1.size(); ++i) + ASSERT_EQ(result1[i], correct1[i]); + + std::vector result2 = {5, 4, 1, 3, 1, 0, 2, 0}; + std::vector correct2 = {5, 4, 5, 3, 1, 0, 2, 2}; + mxnet::kvstore::Postprocess(&result2, 6, 3); + for (unsigned i = 0; i < correct2.size(); ++i) + ASSERT_EQ(result2[i], correct2[i]); + + std::vector result3 = {10, 10, 0, 0, 0, 0, 0, 1, 2, 3, 6, 4, 7, 5, 8, 9}; + std::vector correct3 = {10, 10, 10, 10, 0, 0, 0, 1, 2, 3, 6, 4, 7, 5, 8, 9}; + mxnet::kvstore::Postprocess(&result3, 11, 4); + for (unsigned i = 0; i < correct3.size(); ++i) + ASSERT_EQ(result3[i], correct3[i]); +} + +TEST(GpuTopology, TestDepth) { + ASSERT_EQ(mxnet::kvstore::ComputeDepth(2), 1); + ASSERT_EQ(mxnet::kvstore::ComputeDepth(3), 2); + ASSERT_EQ(mxnet::kvstore::ComputeDepth(8), 3); + ASSERT_EQ(mxnet::kvstore::ComputeDepth(7), 3); + ASSERT_EQ(mxnet::kvstore::ComputeDepth(5), 3); + ASSERT_EQ(mxnet::kvstore::ComputeDepth(4), 2); + ASSERT_EQ(mxnet::kvstore::ComputeDepth(16), 4); +} + +TEST(GpuTopology, TestIsValid) { + std::vector W = {0, 2, 2, 3, 3, 0, 0, + 2, 0, 3, 2, 0, 3, 0, + 2, 3, 0, 3, 0, 0, 2, + 3, 2, 3, 0, 0, 0, 0, + 3, 0, 0, 0, 0, 2, 2, + 0, 3, 0, 0, 2, 0, 3, + 0, 0, 2, 0, 2, 3, 0}; + + std::vector state0 = {3, 2, 1, 5, 0, 0, 4, 6}; + ASSERT_EQ(mxnet::kvstore::IsValid(W, state0, 7, 7, 3), true); + + // 3 connects to 1 first + std::vector state1 = {3, 2, 
0, 4, 1, 1, 5, 6}; + ASSERT_EQ(mxnet::kvstore::IsValid(W, state1, 7, 7, 3), true); + + // 3 does not connect to 5 + std::vector state2 = {3, 2, 5, 1, 0, 4, 2, 5}; + ASSERT_EQ(mxnet::kvstore::IsValid(W, state2, 7, 7, 3), false); + + // 7 exceeds number of GPUs + std::vector state3 = {3, 7, 2, 6, 0, 1, 4, 5}; + ASSERT_EQ(mxnet::kvstore::IsValid(W, state3, 7, 7, 3), false); + + // Test -1 + std::vector state4 = {3, -1, 2, 6, 0, 1, 4, 5}; + ASSERT_EQ(mxnet::kvstore::IsValid(W, state4, 7, 7, 3), true); + + // Test -1 + std::vector state5 = {3, -1, 2, 6, 0, 1, 4, -1}; + ASSERT_EQ(mxnet::kvstore::IsValid(W, state5, 7, 8, 3), false); + + // Test 1 row + std::vector state6 = {3, -1, -1, -1, -1, -1, -1, -1}; + ASSERT_EQ(mxnet::kvstore::IsValid(W, state6, 7, 1, 3), true); +} + +// gemvTest +TEST(GpuTopology, TestGemv) { + std::vector A = {0, 2, 2, 3, 3, 1, 1, 1, // 13 + 2, 0, 3, 2, 1, 3, 1, 1, // 13 + 2, 3, 0, 3, 1, 1, 2, 1, // 13 + 3, 2, 3, 0, 1, 1, 1, 2, // 13 + 3, 1, 1, 1, 0, 2, 2, 3, // 13 + 1, 3, 1, 1, 2, 0, 3, 2, // 13 + 1, 1, 2, 1, 2, 3, 0, 3, // 13 + 1, 1, 1, 2, 3, 2, 3, 0}; // 13 + std::vector x(8, 1); + std::vector y(8, 0); + std::iota(y.begin(), y.end(), 0); + std::vector correct_y(8, 13); + mxnet::kvstore::gemv(A, x, &y); + + ASSERT_EQ(y.size(), correct_y.size()); + for (unsigned i = 0; i < y.size(); ++i) + ASSERT_EQ(y[i], correct_y[i]); +} + +// ewisemultTest +TEST(GpuTopology, TestEwisemult) { + std::vector x(8, 1); + std::vector y(8, 0); + std::iota(y.begin(), y.end(), 0); + int alpha = 5; + std::vector correct_y = {0, 5, 10, 15, 20, 25, 30, 35}; + mxnet::kvstore::ewisemult(x, alpha, &y); + + ASSERT_EQ(y.size(), correct_y.size()); + for (unsigned i = 0; i < y.size(); ++i) + ASSERT_EQ(y[i], correct_y[i]); +} + +// FindBestMoveTest +TEST(GpuTopology, TestFindBestMove) { + std::vector W = {0, 2, 2, 3, 3, 1, 1, 1, + 2, 0, 3, 2, 1, 3, 1, 1, + 2, 3, 0, 3, 1, 1, 2, 1, + 3, 2, 3, 0, 1, 1, 1, 2, + 3, 1, 1, 1, 0, 2, 2, 3, + 1, 3, 1, 1, 2, 0, 3, 2, + 1, 1, 2, 1, 2, 3, 0, 3, + 1, 1, 1, 2, 3, 2, 3, 0}; + std::vector P(8, 0); + std::iota(P.begin(), P.end(), 1); + std::unordered_set used; + + std::vector D1 = {20, 0, 0, 0, 0, 0, 0, 20}; + int a1, b1, g1; + int correct_a1 = 0; + int correct_b1 = 7; + int correct_g1 = 38; + mxnet::kvstore::FindBestMove(W, P, D1, used, &a1, &b1, &g1); + ASSERT_EQ(a1, correct_a1); + ASSERT_EQ(b1, correct_b1); + ASSERT_EQ(g1, correct_g1); + + // -1, -1, 0 indicates no best edge found + std::vector D2 = {0, 0, 0, 0, 0, 0, 0, 0}; + int a2, b2, g2; + int correct_a2 = -1; + int correct_b2 = -1; + int correct_g2 = 0; + mxnet::kvstore::FindBestMove(W, P, D2, used, &a2, &b2, &g2); + ASSERT_EQ(a2, correct_a2); + ASSERT_EQ(b2, correct_b2); + ASSERT_EQ(g2, correct_g2); +} + +// GetRootTest +TEST(GpuTopology, TestGetRoot) { + std::vector P = {0, 0, 1, 1, 2, 2, 3, 3}; + + // Test when roots are non-empty, and matches color + std::unordered_set roots1 = {0, 2, 4, 6}; + std::vector color1 = {0, 1, 2, 3}; + for (unsigned i = 0; i < color1.size(); ++i) { + int root1 = mxnet::kvstore::GetRoot(P, color1[i], roots1); + int correct_root1 = 2*i; + ASSERT_EQ(root1, correct_root1); + } + + // Test when roots is empty + std::unordered_set roots2; + int color2 = 0; + int correct_root2 = -1; + int root2 = mxnet::kvstore::GetRoot(P, color2, roots2); + ASSERT_EQ(root2, correct_root2); + + // Test when roots is non-empty, but no root matches color + std::unordered_set roots3 = {0}; + int color3 = 1; + int correct_root3 = -1; + int root3 = mxnet::kvstore::GetRoot(P, color3, roots3); + ASSERT_EQ(root3, 
correct_root3); + + std::vector P2 = {0, 1, 1, 0, 2, 3, 3, 2}; + std::unordered_set roots4 = roots1; + int color4 = 0; + int correct_root4 = 0; + int root4 = mxnet::kvstore::GetRoot(P, color4, roots4); + ASSERT_EQ(root4, correct_root4); +} + +// GetChildTest +TEST(GpuTopology, TestGetChild) { + std::vector P = {0, 0, 1, 2, 2, 2, 3, 3}; + + // Test when color is not found + int color1 = 4; + int parent1 = 4; + int correct_child1 = -1; + int child1 = mxnet::kvstore::GetChild(P, color1, parent1); + ASSERT_EQ(child1, correct_child1); + + // Test when color is found, but is equal to parent + int color2 = 1; + int parent2 = 2; + int correct_child2 = -1; + int child2 = mxnet::kvstore::GetChild(P, color2, parent2); + ASSERT_EQ(child2, correct_child2); + + // Test when color is found and not equal to parent + int color3 = 3; + int parent3 = 6; + int correct_child3 = 7; + int child3 = mxnet::kvstore::GetChild(P, color3, parent3); + ASSERT_EQ(child3, correct_child3); +} + +// FindBestEdgeTest +TEST(GpuTopology, TestFindBestEdge) { + std::vector W = {0, 2, 2, 3, 3, 1, 1, 1, + 2, 0, 3, 2, 1, 3, 1, 1, + 2, 3, 0, 3, 1, 1, 2, 1, + 3, 2, 3, 0, 1, 1, 1, 2, + 3, 1, 1, 1, 0, 2, 2, 3, + 1, 3, 1, 1, 2, 0, 3, 2, + 1, 1, 2, 1, 2, 3, 0, 3, + 1, 1, 1, 2, 3, 2, 3, 0}; + std::vector P(8, 0); + std::unordered_set used; + + int parent1 = 3; + int dest1 = 0; + std::vector b1; + int g1; + std::vector correct_b1 = {0, 2}; + int correct_g1 = 3; + mxnet::kvstore::FindBestEdge(W, P, parent1, dest1, &b1, &g1); + ASSERT_EQ(b1.size(), correct_b1.size()); + for (unsigned i = 0; i < b1.size(); ++i) + ASSERT_EQ(b1[i], correct_b1[i]); + ASSERT_EQ(g1, correct_g1); + + // {-1}, 0 indicates no best edge found + int parent2 = 4; + int dest2 = 1; + std::vector b2; + int g2; + std::vector correct_b2 = {-1}; + int correct_g2 = 0; + mxnet::kvstore::FindBestEdge(W, P, parent2, dest2, &b2, &g2); + ASSERT_EQ(b2.size(), correct_b2.size()); + for (unsigned i = 0; i < b2.size(); ++i) + ASSERT_EQ(b2[i], correct_b2[i]); + ASSERT_EQ(g2, correct_g2); +} + +// KLGenerateBinaryTreeTest +TEST(GpuTopology, TestKLGenerateBinaryTree1) { + std::vector W = {0, 2, 3, 3, 3, 1, 1, 1, + 2, 0, 3, 2, 1, 3, 1, 1, + 2, 3, 0, 3, 1, 1, 2, 1, + 3, 2, 3, 0, 1, 1, 1, 2, + 3, 1, 1, 1, 0, 2, 3, 3, + 1, 3, 1, 1, 2, 0, 3, 2, + 1, 1, 2, 1, 2, 3, 0, 3, + 1, 1, 1, 2, 3, 2, 3, 0}; + std::vector P = {0, 1, 1, 0, 2, 3, 3, 2}; + std::vector> cluster_pairs; + cluster_pairs.push_back(std::pair(0, -2)); + cluster_pairs.push_back(std::pair(1, -2)); + cluster_pairs.push_back(std::pair(2, -2)); + cluster_pairs.push_back(std::pair(3, -2)); + std::unordered_set roots = {0, 2, 4, 6}; + std::vector topo = {0, 2, 4, 6}; + std::vector scan(2, 0); + std::mt19937 gen(1); + mxnet::kvstore::KLGenerateBinaryTree(W, P, &cluster_pairs, &roots, &topo, + &scan, &gen); + std::vector correct_topo = {0, 2, 4, 6, 0, 3, 2, 1, 4, 7, 6, 5}; + std::vector correct_scan = {0, 0, 4}; + ASSERT_EQ(topo.size(), correct_topo.size()); + for (unsigned i = 0; i < topo.size(); ++i) + ASSERT_EQ(topo[i], correct_topo[i]); + ASSERT_EQ(scan.size(), correct_scan.size()); + for (unsigned i = 0; i < scan.size(); ++i) + ASSERT_EQ(scan[i], correct_scan[i]); +} + +TEST(GpuTopology, TestKLGenerateBinaryTree2) { + std::vector W = {0, 2, 3, 3, 3, 1, 1, 1, + 2, 0, 3, 2, 1, 3, 1, 1, + 2, 3, 0, 3, 1, 1, 2, 1, + 3, 2, 3, 0, 1, 1, 1, 2, + 3, 1, 1, 1, 0, 2, 3, 3, + 1, 3, 1, 1, 2, 0, 3, 2, + 1, 1, 2, 1, 2, 3, 0, 3, + 1, 1, 1, 2, 3, 2, 3, 0}; + std::vector P = {0, 1, 1, 0, 2, 3, 3, 2}; + std::vector> cluster_pairs; + 
cluster_pairs.push_back(std::pair(0, -2)); + cluster_pairs.push_back(std::pair(1, -2)); + cluster_pairs.push_back(std::pair(2, -2)); + cluster_pairs.push_back(std::pair(3, -2)); + std::unordered_set roots = {0, 2, 4, 6}; + std::vector topo = {0, 6, 4, 2}; + std::vector scan(2, 0); + std::mt19937 gen(1); + mxnet::kvstore::KLGenerateBinaryTree(W, P, &cluster_pairs, &roots, &topo, + &scan, &gen); + std::vector correct_topo = {0, 6, 4, 2, 0, 3, 6, 5, 4, 7, 2, 1}; + std::vector correct_scan = {0, 0, 4}; + ASSERT_EQ(topo.size(), correct_topo.size()); + for (unsigned i = 0; i < topo.size(); ++i) + ASSERT_EQ(topo[i], correct_topo[i]); + ASSERT_EQ(scan.size(), correct_scan.size()); + for (unsigned i = 0; i < scan.size(); ++i) + ASSERT_EQ(scan[i], correct_scan[i]); +} + +// UpdateWeightTest +TEST(GpuTopology, TestUpdateWeight) { + std::vector W = {0.f, 1.f, + 1.f, 0.f}; + std::vector topo = {1, 1, 0}; + int num_gpus = 2; + float alpha = 0.7; + std::vector correct_W = {0.f, 0.7f, + 0.7f, 0.f}; + mxnet::kvstore::UpdateWeight(&W, topo, num_gpus, alpha); + ASSERT_EQ(W.size(), correct_W.size()); + for (unsigned i = 0; i < W.size(); ++i) { + ASSERT_EQ(W[i], correct_W[i]); + } +} + +// ComputeTreesFromRoot +TEST(GpuTopology, TestComputeTreesFromRoot1) { + std::vector W = {0, 2, 2, 3, 3, 1, 1, 1, + 2, 0, 3, 2, 1, 3, 1, 1, + 2, 3, 0, 3, 1, 1, 2, 1, + 3, 2, 3, 0, 1, 1, 1, 2, + 3, 1, 1, 1, 0, 2, 2, 3, + 1, 3, 1, 1, 2, 0, 3, 2, + 1, 1, 2, 1, 2, 3, 0, 3, + 1, 1, 1, 2, 3, 2, 3, 0}; + int num_gpus = 8; + int root = 0; + float alpha = 0.7; + bool backtrack = true; + unsigned correct_topo_size = 15; + unsigned correct_scan_size = 5; + std::vector topo; + std::vector scan; + + mxnet::kvstore::ComputeTreesFromRoot(&W, num_gpus, root, alpha, backtrack, + &topo, &scan); + + ASSERT_EQ(topo.size(), correct_topo_size); + ASSERT_EQ(scan.size(), correct_scan_size); +} + +// IsConnected +// Test on graph that is "disconnected" by NVLink +TEST(GpuTopology, TestIsConnected1) { + std::vector W = {0, 0, 2, 0, + 0, 0, 0, 2, + 2, 0, 0, 0, + 0, 2, 0, 0}; + int num_gpus = 4; + + bool connected = mxnet::kvstore::IsConnected(W, num_gpus); + + bool correct_connected = false; + ASSERT_EQ(connected, correct_connected); +} + +// IsConnected +// Test on graph that is "disconnected" by NVLink +TEST(GpuTopology, TestIsConnected2) { + std::vector W = {1, 1, 2, 1, + 1, 1, 1, 2, + 2, 1, 1, 1, + 1, 2, 1, 1}; + int num_gpus = 4; + + bool connected = mxnet::kvstore::IsConnected(W, num_gpus); + + bool correct_connected = false; + ASSERT_EQ(connected, correct_connected); +} + +// IsConnected +// Test on graph that is "disconnected" by NVLink +TEST(GpuTopology, TestIsConnected3) { + std::vector W = {1, 1, 2, 2, + 1, 1, 1, 2, + 2, 1, 1, 1, + 2, 2, 1, 1}; + int num_gpus = 4; + + bool connected = mxnet::kvstore::IsConnected(W, num_gpus); + + bool correct_connected = true; + ASSERT_EQ(connected, correct_connected); +} + +// ComputeTreesTest with backtracking +TEST(GpuTopology, TestComputeTrees1) { + std::mt19937 gen(1); + float alpha = 0.7; + bool backtrack = true; + // Do 5 randomized tests per GPU count from 2 to 16 + for (int num_gpus = 2; num_gpus <= 16; ++num_gpus) { + LOG(INFO) << "Testing " << num_gpus << " x " << num_gpus; + for (int i = 0; i < 5; ++i) { + TestComputeTreesRandomized(num_gpus, alpha, backtrack, &gen); + } + } +} + +// ComputeTreesTest with Kernighan-Lin +TEST(GpuTopology, TestComputeTrees2) { + std::mt19937 gen(1); + float alpha = 0.7; + bool backtrack = false; + // Do 5 randomized tests per GPU count from 2 to 16 + for (int 
num_gpus = 2; num_gpus <= 16; ++num_gpus) { + LOG(INFO) << "Testing " << num_gpus << " x " << num_gpus; + for (int i = 0; i < 5; ++i) { + TestComputeTreesRandomized(num_gpus, alpha, backtrack, &gen); + } + } +} + +TEST(GpuTopology, TestPermuteMatrix) { + std::vector W = {0, 2, 2, 3, 3, 1, 1, 1, + 2, 0, 3, 2, 1, 3, 1, 1, + 2, 3, 0, 3, 1, 1, 2, 1, + 3, 2, 3, 0, 1, 1, 1, 2, + 3, 1, 1, 1, 0, 2, 2, 3, + 1, 3, 1, 1, 2, 0, 3, 2, + 1, 1, 2, 1, 2, 3, 0, 3, + 1, 1, 1, 2, 3, 2, 3, 0}; + + std::vector P1 = {0, 1, 2, 3, 4, 5, 6, 7}; + std::vector A(8*8, 0); + PermuteMatrix(W, P1, &A); + for (unsigned i=0; i < W.size(); ++i) + ASSERT_EQ(A[i], W[i]); +} + +TEST(GpuTopology, TestKernighanLin1) { + std::vector W = {0, 1, 2, 3, 2, 4, + 1, 0, 1, 4, 2, 1, + 2, 1, 0, 3, 2, 1, + 3, 4, 3, 0, 4, 3, + 2, 2, 2, 4, 0, 2, + 4, 1, 1, 3, 2, 0}; + std::vector P(6, 0); + std::vector> cluster_pairs; + int num_partitions = 1; + std::mt19937 gen(1); + bool stop = mxnet::kvstore::KernighanLin(W, &P, &num_partitions, + &cluster_pairs, &gen); + + std::vector> correct_pairs; + correct_pairs.push_back(std::pair(0, 1)); + std::vector correct_P = {0, 1, 0, 1, 1, 0}; + ASSERT_EQ(stop, false); + ASSERT_EQ(num_partitions, 2); + ASSERT_EQ(cluster_pairs.size(), correct_pairs.size()); + for (unsigned i = 0; i < cluster_pairs.size(); ++i) { + ASSERT_EQ(cluster_pairs[i].first, correct_pairs[i].first); + ASSERT_EQ(cluster_pairs[i].second, correct_pairs[i].second); + } + ASSERT_EQ(P.size(), correct_P.size()); + unsigned error = 0; + for (unsigned i = 0; i < P.size(); ++i) { + if (P[i] != correct_P[i]) + error++; + } + EXPECT_TRUE(error == 0 || error == P.size()) + << "Where real value: " << error + << " not equal neither: " << 0 + << " nor: " << P.size() << "."; +} + +TEST(GpuTopology, TestKernighanLin2) { + std::vector W = {0, 1, 0, 0, 1, 1, 0, 0, + 1, 0, 0, 0, 1, 1, 0, 0, + 0, 0, 0, 1, 0, 1, 1, 1, + 0, 0, 1, 0, 0, 0, 1, 1, + 1, 1, 0, 0, 0, 1, 0, 0, + 1, 1, 1, 0, 1, 0, 0, 0, + 0, 0, 1, 1, 0, 0, 0, 1, + 0, 0, 1, 1, 0, 0, 1, 0}; + std::vector P(8, 0); + std::vector> cluster_pairs; + int num_partitions = 1; + std::mt19937 gen(1); + bool stop = mxnet::kvstore::KernighanLin(W, &P, &num_partitions, + &cluster_pairs, &gen); + + std::vector> correct_pairs; + correct_pairs.push_back(std::pair(0, 1)); + std::vector correct_P = {0, 0, 1, 1, 0, 0, 1, 1}; + ASSERT_EQ(stop, false); + ASSERT_EQ(num_partitions, 2); + ASSERT_EQ(cluster_pairs.size(), correct_pairs.size()); + for (unsigned i = 0; i < cluster_pairs.size(); ++i) { + ASSERT_EQ(cluster_pairs[i].first, correct_pairs[i].first); + ASSERT_EQ(cluster_pairs[i].second, correct_pairs[i].second); + } + ASSERT_EQ(P.size(), correct_P.size()); + unsigned error = 0; + for (unsigned i = 0; i < P.size(); ++i) { + if (P[i] != correct_P[i]) + error++; + } + EXPECT_TRUE(error == 0 || error == P.size()) + << "Where real value: " << error + << " not equal neither: " << 0 + << " nor: " << P.size() << "."; +} diff --git a/tests/python/gpu/test_device.py b/tests/python/gpu/test_device.py new file mode 100644 index 000000000..66772dc86 --- /dev/null +++ b/tests/python/gpu/test_device.py @@ -0,0 +1,82 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import mxnet as mx +import numpy as np +import unittest +import os + +shapes = [(10), (100), (1000), (10000), (100000), (2,2), (2,3,4,5,6,7,8)] +keys = [1,2,3,4,5,6,7] +num_gpus = len(mx.test_utils.list_gpus()) + + +if num_gpus > 8 : + print("The machine has {} gpus. We will run the test on 8 gpus.".format(num_gpus)) + print("There is a limit for all PCI-E hardware on creating number of P2P peers. The limit is 8.") + num_gpus = 8; + +gpus = range(1, 1+num_gpus) + +class EnvManager: + def __init__(self, key, val): + self._key = key + self._next_val = val + self._prev_val = None + + def __enter__(self): + try: + self._prev_val = os.environ[self._key] + except KeyError: + self._prev_val = '' + os.environ[self._key] = self._next_val + + def __exit__(self, ptype, value, trace): + os.environ[self._key] = self._prev_val + +def test_device_pushpull(): + def check_dense_pushpull(kv_type): + for shape, key in zip(shapes, keys): + for n_gpus in gpus: + kv_device = mx.kv.create(kv_type) + a = mx.nd.ones(shape, mx.gpu(0)) + cur_key = str(key*max(gpus)+n_gpus) + kv_device.init(cur_key, a) + arr_list = [mx.nd.ones(shape, mx.gpu(x)) for x in range(n_gpus)] + res = [mx.nd.zeros(shape, mx.gpu(x)) for x in range(n_gpus)] + kv_device.push(cur_key, arr_list) + kv_device.pull(cur_key, res) + for x in range(n_gpus): + assert(np.sum(np.abs((res[x]-n_gpus).asnumpy()))==0) + + envs1 = '1' + key1 = 'MXNET_KVSTORE_TREE_ARRAY_BOUND' + envs2 = ['','1'] + key2 = 'MXNET_KVSTORE_USETREE' + for i in range(2): + for val2 in envs2: + with EnvManager(key2, val2): + check_dense_pushpull('local') + check_dense_pushpull('device') + + os.environ[key1] = envs1 + os.environ[key1] = '' + + print ("Passed") + +if __name__ == '__main__': + test_device_pushpull() diff --git a/tests/python/gpu/test_kvstore_gpu.py b/tests/python/gpu/test_kvstore_gpu.py index 31120aa14..5e9b120f1 100644 --- a/tests/python/gpu/test_kvstore_gpu.py +++ b/tests/python/gpu/test_kvstore_gpu.py @@ -30,6 +30,21 @@ keys = [5, 7, 11] str_keys = ['b', 'c', 'd'] +class EnvManager: + def __init__(self, key, val): + self._key = key + self._next_val = val + self._prev_val = None + + def __enter__(self): + try: + self._prev_val = os.environ[self._key] + except KeyError: + self._prev_val = '' + os.environ[self._key] = self._next_val + + def __exit__(self, ptype, value, trace): + os.environ[self._key] = self._prev_val def init_kv_with_str(stype='default', kv_type='local'): """init kv """ @@ -44,13 +59,13 @@ def init_kv_with_str(stype='default', kv_type='local'): # Not reproducible, so this test is back on random seeds. 
@with_seed()
 def test_rsp_push_pull():
-    def check_rsp_push_pull(kv_type, is_push_cpu=True):
+    def check_rsp_push_pull(kv_type, sparse_pull, is_push_cpu=True):
         kv = init_kv_with_str('row_sparse', kv_type)
         kv.init('e', mx.nd.ones(shape).tostype('row_sparse'))
         push_ctxs = [mx.cpu(i) if is_push_cpu else mx.gpu(i) for i in range(2)]
         kv.push('e', [mx.nd.ones(shape, ctx=context).tostype('row_sparse') for context in push_ctxs])
 
-    def check_rsp_pull(kv, count, ctxs, is_same_rowid=False, use_slice=False):
+    def check_rsp_pull(kv, count, ctxs, sparse_pull, is_same_rowid=False, use_slice=False):
         num_rows = shape[0]
         row_ids = []
         all_row_ids = np.arange(num_rows)
@@ -77,26 +92,34 @@ def check_rsp_pull(kv, count, ctxs, is_same_rowid=False, use_slice=False):
                     expected_val += 0 if row in excluded_row_ids else 2
                 assert_almost_equal(retained[row], expected_val)
 
-        kv.pull('e', out=vals_to_pull, ignore_sparse=False)
-        for val in vals:
-            retained = val.asnumpy()
-            expected_val = np.zeros_like(retained)
-            expected_val[:] = 2
-            assert_almost_equal(retained, expected_val)
-
-    check_rsp_pull(kv, 1, [mx.gpu(0)])
-    check_rsp_pull(kv, 1, [mx.cpu(0)])
-    check_rsp_pull(kv, 4, [mx.gpu(i//2) for i in range(4)])
-    check_rsp_pull(kv, 4, [mx.gpu(i//2) for i in range(4)], is_same_rowid=True)
-    check_rsp_pull(kv, 4, [mx.cpu(i) for i in range(4)])
-    check_rsp_pull(kv, 4, [mx.cpu(i) for i in range(4)], is_same_rowid=True)
-    check_rsp_pull(kv, 4, [mx.gpu(i//2) for i in range(4)], use_slice=True)
-    check_rsp_pull(kv, 4, [mx.cpu(i) for i in range(4)], use_slice=True)
-
-    check_rsp_push_pull('local')
-    check_rsp_push_pull('device')
-    check_rsp_push_pull('device', is_push_cpu=False)
-
+        if sparse_pull:
+            kv.pull('e', out=vals_to_pull, ignore_sparse=False)
+            for val in vals:
+                retained = val.asnumpy()
+                expected_val = np.zeros_like(retained)
+                expected_val[:] = 2
+                assert_almost_equal(retained, expected_val)
+
+    check_rsp_pull(kv, 1, [mx.gpu(0)], sparse_pull)
+    check_rsp_pull(kv, 1, [mx.cpu(0)], sparse_pull)
+    check_rsp_pull(kv, 4, [mx.gpu(i//2) for i in range(4)], sparse_pull)
+    check_rsp_pull(kv, 4, [mx.gpu(i//2) for i in range(4)], sparse_pull, is_same_rowid=True)
+    check_rsp_pull(kv, 4, [mx.cpu(i) for i in range(4)], sparse_pull)
+    check_rsp_pull(kv, 4, [mx.cpu(i) for i in range(4)], sparse_pull, is_same_rowid=True)
+    check_rsp_pull(kv, 4, [mx.gpu(i//2) for i in range(4)], sparse_pull, use_slice=True)
+    check_rsp_pull(kv, 4, [mx.cpu(i) for i in range(4)], sparse_pull, use_slice=True)
+
+    envs = ["","1"]
+    key = "MXNET_KVSTORE_USETREE"
+    for val in envs:
+        with EnvManager(key, val):
+            # `is` compares identity, not value; compare strings with ==/!=
+            sparse_pull = (val != "1")
+            check_rsp_push_pull('local', sparse_pull)
+            check_rsp_push_pull('device', sparse_pull)
+            check_rsp_push_pull('device', sparse_pull, is_push_cpu=False)
 
 def test_row_sparse_pull_single_device():
     kvstore = mx.kv.create('device')
diff --git a/tests/python/gpu/test_nccl.py b/tests/python/gpu/test_nccl.py
index 8e00ba05f..40ef6fdfd 100644
--- a/tests/python/gpu/test_nccl.py
+++ b/tests/python/gpu/test_nccl.py
@@ -18,6 +18,7 @@
 import mxnet as mx
 import numpy as np
 import unittest
+import os
 
 shapes = [(10), (100), (1000), (10000), (100000), (2,2), (2,3,4,5,6,7,8)]
 keys = [1,2,3,4,5,6,7]
@@ -29,7 +30,7 @@ print("There is a limit for all PCI-E hardware on creating number of P2P peers.
The limit is 8.") num_gpus = 8; -gpus = range(1,1+num_gpus) +gpus = range(1, 1+num_gpus) @unittest.skip("Test requires NCCL library installed and enabled during build") def test_nccl_pushpull(): From fa935a833f6ebc6581cb399e2ea0a73dd6939520 Mon Sep 17 00:00:00 2001 From: Sergey Sokolov Date: Tue, 24 Jul 2018 14:57:23 -0700 Subject: [PATCH 04/63] Fix file name creation for Windows (#11765) * Fix file name creation for Windows * Forcing build * Force build again --- docs/tutorials/python/predict_image.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/tutorials/python/predict_image.md b/docs/tutorials/python/predict_image.md index 3e68be07f..a9a0d2901 100644 --- a/docs/tutorials/python/predict_image.md +++ b/docs/tutorials/python/predict_image.md @@ -57,8 +57,8 @@ from collections import namedtuple Batch = namedtuple('Batch', ['data']) def get_image(url, show=False): - # download and show the image - fname = mx.test_utils.download(url) + # download and show the image. Remove query string from the file name. + fname = mx.test_utils.download(url, fname=url.split('/')[-1].split('?')[0]) img = mx.image.imread(fname) if img is None: return None From 8a21a06921ba3f514cf89e1320054834dedfb563 Mon Sep 17 00:00:00 2001 From: Tong He Date: Tue, 24 Jul 2018 17:06:36 -0700 Subject: [PATCH 05/63] update vgg pretrained model (#11860) * update vgg pretrained model * Trigger CI * Trigger CI --- docs/api/python/gluon/model_zoo.md | 8 ++++---- python/mxnet/gluon/model_zoo/model_store.py | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/docs/api/python/gluon/model_zoo.md b/docs/api/python/gluon/model_zoo.md index 996a9450c..b139bfa8a 100644 --- a/docs/api/python/gluon/model_zoo.md +++ b/docs/api/python/gluon/model_zoo.md @@ -60,12 +60,12 @@ The following table summarizes the available models. 
 | squeezenet1.1 | [SqueezeNet 1.1](https://github.com/DeepScale/SqueezeNet/tree/master/SqueezeNet_v1.1) | 1,235,496 | 0.5496 | 0.7817 | Converted from pytorch vision |
 | vgg11 | [VGG-11](https://arxiv.org/abs/1409.1556) | 132,863,336 | 0.6662 | 0.8734 | Converted from pytorch vision |
 | vgg13 | [VGG-13](https://arxiv.org/abs/1409.1556) | 133,047,848 | 0.6774 | 0.8811 | Converted from pytorch vision |
-| vgg16 | [VGG-16](https://arxiv.org/abs/1409.1556) | 138,357,544 | 0.7238 | 0.9102 | Trained with [script](https://github.com/apache/incubator-mxnet/blob/master/example/gluon/image_classification.py) |
-| vgg19 | [VGG-19](https://arxiv.org/abs/1409.1556) | 143,667,240 | 0.7370 | 0.9151 | Trained with [script](https://github.com/apache/incubator-mxnet/blob/master/example/gluon/image_classification.py) |
+| vgg16 | [VGG-16](https://arxiv.org/abs/1409.1556) | 138,357,544 | 0.7323 | 0.9132 | Trained with [script](https://github.com/apache/incubator-mxnet/blob/master/example/gluon/image_classification.py) |
+| vgg19 | [VGG-19](https://arxiv.org/abs/1409.1556) | 143,667,240 | 0.7411 | 0.9135 | Trained with [script](https://github.com/apache/incubator-mxnet/blob/master/example/gluon/image_classification.py) |
 | vgg11_bn | [VGG-11 with batch normalization](https://arxiv.org/abs/1409.1556) | 132,874,344 | 0.6859 | 0.8872 | Converted from pytorch vision |
 | vgg13_bn | [VGG-13 with batch normalization](https://arxiv.org/abs/1409.1556) | 133,059,624 | 0.6884 | 0.8882 | Converted from pytorch vision |
-| vgg16_bn | [VGG-16 with batch normalization](https://arxiv.org/abs/1409.1556) | 138,374,440 | 0.7254 | 0.9114 | Trained with [script](https://github.com/apache/incubator-mxnet/blob/master/example/gluon/image_classification.py) |
-| vgg19_bn | [VGG-19 with batch normalization](https://arxiv.org/abs/1409.1556) | 143,689,256 | 0.7365 | 0.9182 | Trained with [script](https://github.com/apache/incubator-mxnet/blob/master/example/gluon/image_classification.py) |
+| vgg16_bn | [VGG-16 with batch normalization](https://arxiv.org/abs/1409.1556) | 138,374,440 | 0.7310 | 0.9176 | Trained with [script](https://github.com/apache/incubator-mxnet/blob/master/example/gluon/image_classification.py) |
+| vgg19_bn | [VGG-19 with batch normalization](https://arxiv.org/abs/1409.1556) | 143,689,256 | 0.7433 | 0.9185 | Trained with [script](https://github.com/apache/incubator-mxnet/blob/master/example/gluon/image_classification.py) |
 
 ```eval_rst
 ..
autosummary:: diff --git a/python/mxnet/gluon/model_zoo/model_store.py b/python/mxnet/gluon/model_zoo/model_store.py index 075819560..7eead68f0 100644 --- a/python/mxnet/gluon/model_zoo/model_store.py +++ b/python/mxnet/gluon/model_zoo/model_store.py @@ -55,10 +55,10 @@ ('ee79a8098a91fbe05b7a973fed2017a6117723a8', 'vgg11_bn'), ('6bc5de58a05a5e2e7f493e2d75a580d83efde38c', 'vgg13'), ('7d97a06c3c7a1aecc88b6e7385c2b373a249e95e', 'vgg13_bn'), - ('ef8c5d7fdddb7ef143bbcccbfecdea42552c51d1', 'vgg16'), - ('0cebe2f070390db194f0bfa367df00d42de7be29', 'vgg16_bn'), - ('02c12a31188a4f9da8651647b64fc2b944a72a40', 'vgg19'), - ('11ee6adf98209813dbe929acb57a539ac081e954', 'vgg19_bn')]} + ('e660d4569ccb679ec68f1fd3cce07a387252a90a', 'vgg16'), + ('7f01cf050d357127a73826045c245041b0df7363', 'vgg16_bn'), + ('ad2f660d101905472b83590b59708b71ea22b2e5', 'vgg19'), + ('f360b758e856f1074a85abd5fd873ed1d98297c3', 'vgg19_bn')]} apache_repo_url = 'https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/' _url_format = '{repo_url}gluon/models/{file_name}.zip' From 07a9977ba440fe0c4f110c775a05d32cb2d12039 Mon Sep 17 00:00:00 2001 From: Leonard Lausen Date: Wed, 25 Jul 2018 00:50:27 +0000 Subject: [PATCH 06/63] Add verify_ssl option to gluon.utils.download (#11546) * Add verify_ssl option to gluon.utils.download Sometimes datasets may be hosted on servers that serve invalid SSL certificates. * Add warning * Add test * Mock gluon.utils.download tests * Add Py2 mock dependency to Jenkinsfile --- Jenkinsfile | 2 ++ python/mxnet/gluon/utils.py | 14 ++++++-- tests/python/unittest/test_gluon_utils.py | 44 ++++++++++++++++++++--- 3 files changed, 54 insertions(+), 6 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index e81fb5039..ed998ee14 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -849,6 +849,7 @@ try { bat """xcopy C:\\mxnet\\data data /E /I /Y xcopy C:\\mxnet\\model model /E /I /Y call activate py2 + pip install mock set PYTHONPATH=${env.WORKSPACE}\\pkg_vc14_cpu\\python del /S /Q ${env.WORKSPACE}\\pkg_vc14_cpu\\python\\*.pyc C:\\mxnet\\test_cpu.bat""" @@ -893,6 +894,7 @@ try { bat """xcopy C:\\mxnet\\data data /E /I /Y xcopy C:\\mxnet\\model model /E /I /Y call activate py2 + pip install mock set PYTHONPATH=${env.WORKSPACE}\\pkg_vc14_gpu\\python del /S /Q ${env.WORKSPACE}\\pkg_vc14_gpu\\python\\*.pyc C:\\mxnet\\test_gpu.bat""" diff --git a/python/mxnet/gluon/utils.py b/python/mxnet/gluon/utils.py index fcb7c97b9..f04479d23 100644 --- a/python/mxnet/gluon/utils.py +++ b/python/mxnet/gluon/utils.py @@ -175,7 +175,7 @@ def check_sha1(filename, sha1_hash): return sha1.hexdigest() == sha1_hash -def download(url, path=None, overwrite=False, sha1_hash=None, retries=5): +def download(url, path=None, overwrite=False, sha1_hash=None, retries=5, verify_ssl=True): """Download an given URL Parameters @@ -192,6 +192,8 @@ def download(url, path=None, overwrite=False, sha1_hash=None, retries=5): but doesn't match. retries : integer, default 5 The number of times to attempt the download in case of failure or non 200 return codes + verify_ssl : bool, default True + Verify SSL certificates. Returns ------- @@ -200,6 +202,9 @@ def download(url, path=None, overwrite=False, sha1_hash=None, retries=5): """ if path is None: fname = url.split('/')[-1] + # Empty filenames are invalid + assert fname, 'Can\'t construct file-name from this URL. ' \ + 'Please set the `path` option manually.' 
else: path = os.path.expanduser(path) if os.path.isdir(path): @@ -208,6 +213,11 @@ def download(url, path=None, overwrite=False, sha1_hash=None, retries=5): fname = path assert retries >= 0, "Number of retries should be at least 0" + if not verify_ssl: + warnings.warn( + 'Unverified HTTPS request is being made (verify_ssl=False). ' + 'Adding certificate verification is strongly advised.') + if overwrite or not os.path.exists(fname) or (sha1_hash and not check_sha1(fname, sha1_hash)): dirname = os.path.dirname(os.path.abspath(os.path.expanduser(fname))) if not os.path.exists(dirname): @@ -217,7 +227,7 @@ def download(url, path=None, overwrite=False, sha1_hash=None, retries=5): # pylint: disable=W0703 try: print('Downloading %s from %s...'%(fname, url)) - r = requests.get(url, stream=True) + r = requests.get(url, stream=True, verify=verify_ssl) if r.status_code != 200: raise RuntimeError("Failed downloading url %s"%url) with open(fname, 'wb') as f: diff --git a/tests/python/unittest/test_gluon_utils.py b/tests/python/unittest/test_gluon_utils.py index a5d3b1401..431852427 100644 --- a/tests/python/unittest/test_gluon_utils.py +++ b/tests/python/unittest/test_gluon_utils.py @@ -15,20 +15,56 @@ # specific language governing permissions and limitations # under the License. +import io import os import tempfile +import warnings +try: + from unittest import mock +except ImportError: + import mock import mxnet as mx -from nose.tools import * +import requests +from nose.tools import raises + + +class MockResponse(requests.Response): + def __init__(self, status_code, content): + super(MockResponse, self).__init__() + assert isinstance(status_code, int) + self.status_code = status_code + self.raw = io.BytesIO(content.encode('utf-8')) @raises(Exception) +@mock.patch( + 'requests.get', mock.Mock(side_effect=requests.exceptions.ConnectionError)) def test_download_retries(): mx.gluon.utils.download("http://doesnotexist.notfound") + +@mock.patch( + 'requests.get', + mock.Mock(side_effect= + lambda *args, **kwargs: MockResponse(200, 'MOCK CONTENT' * 100))) def test_download_successful(): tmp = tempfile.mkdtemp() tmpfile = os.path.join(tmp, 'README.md') - mx.gluon.utils.download("https://raw.githubusercontent.com/apache/incubator-mxnet/master/README.md", - path=tmpfile) - assert os.path.getsize(tmpfile) > 100 \ No newline at end of file + mx.gluon.utils.download( + "https://raw.githubusercontent.com/apache/incubator-mxnet/master/README.md", + path=tmpfile) + assert os.path.getsize(tmpfile) > 100 + + +@mock.patch( + 'requests.get', + mock.Mock( + side_effect=lambda *args, **kwargs: MockResponse(200, 'MOCK CONTENT'))) +def test_download_ssl_verify(): + with warnings.catch_warnings(record=True) as warnings_: + mx.gluon.utils.download( + "https://mxnet.incubator.apache.org/index.html", verify_ssl=False) + assert any( + str(w.message).startswith('Unverified HTTPS request') + for w in warnings_) From 424fafe58785520cfb8e3ad0e6ce26f9e9db5e24 Mon Sep 17 00:00:00 2001 From: Naveen Swamy Date: Wed, 25 Jul 2018 00:09:51 -0700 Subject: [PATCH 07/63] [MXNET-710] Change POM files to be able to regularly publish to Apache Release & Maven Central Repo (#11862) * pom file changes for maven builds --- Makefile | 23 +++- .../assembly/linux-x86_64-cpu/pom.xml | 36 +++++- .../assembly/linux-x86_64-gpu/pom.xml | 36 +++++- scala-package/assembly/osx-x86_64-cpu/pom.xml | 41 +++++++ scala-package/assembly/pom.xml | 9 +- scala-package/core/pom.xml | 7 ++ scala-package/dev/compile-mxnet-backend.sh | 108 ++++++++++++++++++ 
scala-package/infer/pom.xml | 7 ++ .../init-native/linux-x86_64/pom.xml | 9 +- scala-package/init-native/osx-x86_64/pom.xml | 11 +- scala-package/init-native/pom.xml | 13 +++ scala-package/init/pom.xml | 33 ++++++ scala-package/macros/pom.xml | 7 ++ scala-package/native/linux-x86_64-cpu/pom.xml | 9 +- scala-package/native/linux-x86_64-gpu/pom.xml | 9 +- scala-package/native/osx-x86_64-cpu/pom.xml | 9 +- scala-package/native/pom.xml | 13 +++ scala-package/pom.xml | 75 ++++++------ 18 files changed, 402 insertions(+), 53 deletions(-) create mode 100755 scala-package/dev/compile-mxnet-backend.sh diff --git a/Makefile b/Makefile index 5816637f9..88f7dd927 100644 --- a/Makefile +++ b/Makefile @@ -589,6 +589,7 @@ scalaclean: scalapkg: (cd $(ROOTDIR)/scala-package; \ mvn package -P$(SCALA_PKG_PROFILE),$(SCALA_VERSION_PROFILE) -Dcxx="$(CXX)" \ + -Dbuild.platform="$(SCALA_PKG_PROFILE)" \ -Dcflags="$(CFLAGS)" -Dldflags="$(LDFLAGS)" \ -Dcurrent_libdir="$(ROOTDIR)/lib" \ -Dlddeps="$(LIB_DEP) $(ROOTDIR)/lib/libmxnet.a") @@ -608,12 +609,32 @@ scalaintegrationtest: scalainstall: (cd $(ROOTDIR)/scala-package; \ mvn install -P$(SCALA_PKG_PROFILE),$(SCALA_VERSION_PROFILE) -DskipTests -Dcxx="$(CXX)" \ + -Dbuild.platform="$(SCALA_PKG_PROFILE)" \ -Dcflags="$(CFLAGS)" -Dldflags="$(LDFLAGS)" \ -Dlddeps="$(LIB_DEP) $(ROOTDIR)/lib/libmxnet.a") +scalarelease-dryrun: + (cd $(ROOTDIR)/scala-package; \ + mvn release:clean release:prepare -DdryRun=true -DautoVersionSubmodules=true \ + -Papache-release,$(SCALA_PKG_PROFILE),$(SCALA_VERSION_PROFILE) \ + -Darguments=""-Dbuild\.platform=\""$(SCALA_PKG_PROFILE)\""\ -DskipTests\ -Dcflags=\""$(CFLAGS)\""\ -Dcxx=\""$(CXX)\""\ -Dldflags=\""$(LDFLAGS)\""\ -Dlddeps=\""$(LIB_DEP) $(ROOTDIR)/lib/libmxnet.a\"""") + +scalarelease-prepare: + (cd $(ROOTDIR)/scala-package; \ + mvn release:clean release:prepare -DautoVersionSubmodules=true \ + -Papache-release,$(SCALA_PKG_PROFILE),$(SCALA_VERSION_PROFILE) \ + -Darguments=""-Dbuild\.platform=\""$(SCALA_PKG_PROFILE)\""\ -DskipTests\ -Dcflags=\""$(CFLAGS)\""\ -Dcxx=\""$(CXX)\""\ -Dldflags=\""$(LDFLAGS)\""\ -Dlddeps=\""$(LIB_DEP) $(ROOTDIR)/lib/libmxnet.a\"""") + +scalarelease-perform: + (cd $(ROOTDIR)/scala-package; \ + mvn release:perform -DautoVersionSubmodules=true \ + -Papache-release,$(SCALA_PKG_PROFILE),$(SCALA_VERSION_PROFILE) \ + -Darguments=""-Dbuild\.platform=\""$(SCALA_PKG_PROFILE)\""\ -DskipTests\ -Dcflags=\""$(CFLAGS)\""\ -Dcxx=\""$(CXX)\""\ -Dldflags=\""$(LDFLAGS)\""\ -Dlddeps=\""$(LIB_DEP) $(ROOTDIR)/lib/libmxnet.a\"""") + scaladeploy: (cd $(ROOTDIR)/scala-package; \ - mvn deploy -Prelease,$(SCALA_PKG_PROFILE),$(SCALA_VERSION_PROFILE) -DskipTests -Dcxx="$(CXX)" \ + mvn deploy -Papache-release,$(SCALA_PKG_PROFILE),$(SCALA_VERSION_PROFILE) \-DskipTests -Dcxx="$(CXX)" \ + -Dbuild.platform="$(SCALA_PKG_PROFILE)" \ -Dcflags="$(CFLAGS)" -Dldflags="$(LDFLAGS)" \ -Dlddeps="$(LIB_DEP) $(ROOTDIR)/lib/libmxnet.a") diff --git a/scala-package/assembly/linux-x86_64-cpu/pom.xml b/scala-package/assembly/linux-x86_64-cpu/pom.xml index 5ae050c68..b410de10f 100644 --- a/scala-package/assembly/linux-x86_64-cpu/pom.xml +++ b/scala-package/assembly/linux-x86_64-cpu/pom.xml @@ -26,15 +26,49 @@ 1.3.0-SNAPSHOT so + + org.apache.mxnet + mxnet-infer_${scala.binary.version} + 1.3.0-SNAPSHOT + + + org.codehaus.mojo + flatten-maven-plugin + + + + remove + + + + + + flatten + process-resources + + flatten + + + + + flatten.clean + clean + + clean + + + + org.apache.maven.plugins maven-deploy-plugin + false - true + false diff --git 
a/scala-package/assembly/linux-x86_64-gpu/pom.xml b/scala-package/assembly/linux-x86_64-gpu/pom.xml index c4d7aa98b..0a1e79538 100644 --- a/scala-package/assembly/linux-x86_64-gpu/pom.xml +++ b/scala-package/assembly/linux-x86_64-gpu/pom.xml @@ -26,15 +26,49 @@ 1.3.0-SNAPSHOT so + + org.apache.mxnet + mxnet-infer_${scala.binary.version} + 1.3.0-SNAPSHOT + + + org.codehaus.mojo + flatten-maven-plugin + + + + remove + + + + + + flatten + process-resources + + flatten + + + + + flatten.clean + clean + + clean + + + + org.apache.maven.plugins maven-deploy-plugin + false - true + false diff --git a/scala-package/assembly/osx-x86_64-cpu/pom.xml b/scala-package/assembly/osx-x86_64-cpu/pom.xml index 69e87b7b1..8a12d8087 100644 --- a/scala-package/assembly/osx-x86_64-cpu/pom.xml +++ b/scala-package/assembly/osx-x86_64-cpu/pom.xml @@ -26,10 +26,51 @@ 1.3.0-SNAPSHOT jnilib + + org.apache.mxnet + mxnet-infer_${scala.binary.version} + 1.3.0-SNAPSHOT + + + org.codehaus.mojo + flatten-maven-plugin + + + + remove + + + + + + flatten + process-resources + + flatten + + + + + flatten.clean + clean + + clean + + + + + + org.apache.maven.plugins + maven-deploy-plugin + false + + false + + org.apache.maven.plugins maven-assembly-plugin diff --git a/scala-package/assembly/pom.xml b/scala-package/assembly/pom.xml index 83d6c732a..aef50ce57 100644 --- a/scala-package/assembly/pom.xml +++ b/scala-package/assembly/pom.xml @@ -37,6 +37,13 @@ release + + org.apache.maven.plugins + maven-deploy-plugin + + true + + org.apache.maven.plugins maven-source-plugin @@ -47,7 +54,7 @@ jar-no-fork - true> + true diff --git a/scala-package/core/pom.xml b/scala-package/core/pom.xml index 79da63b7c..134e0a59d 100644 --- a/scala-package/core/pom.xml +++ b/scala-package/core/pom.xml @@ -48,6 +48,13 @@ + + org.apache.maven.plugins + maven-deploy-plugin + + true + + org.apache.maven.plugins maven-jar-plugin diff --git a/scala-package/dev/compile-mxnet-backend.sh b/scala-package/dev/compile-mxnet-backend.sh new file mode 100755 index 000000000..b065e01af --- /dev/null +++ b/scala-package/dev/compile-mxnet-backend.sh @@ -0,0 +1,108 @@ +#!/bin/bash + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# (Yizhi) This is mainly inspired by the script in apache/spark. +# I did some modificaiton to get it with our project. +# + +set -e +echo "Compiling MXNet Backend, Hang tight!....." + +if [[ ($# -ne 2) || ( $1 == "--help") || $1 == "-h" ]]; then + echo "Usage: [-h|--help] " 1>&2 + exit 1 +fi +PLATFORM=$1 +MXNETDIR=$2 + + +# below routine shamelessly copied from +# https://github.com/apache/incubator-mxnet/blob/master/setup-utils/install-mxnet-osx-python.sh +# This routine executes a command, +# prints error message on the console on non-zero exit codes and +# returns the exit code to the caller. 
+chkret() {
+  cmd=$*
+  echo "$cmd"
+  $cmd
+  ret=$?
+  if [[ ${ret} != 0 ]]; then
+    echo " "
+    echo "ERROR: Return value non-zero for: $cmd"
+    echo " "
+    exit 1
+  fi
+} # chkret()
+
+UNAME=`uname -s`
+chkret pushd $MXNETDIR
+chkret git submodule update --init --recursive
+
+# don't want to overwrite an existing config file
+cp make/config.mk ./config.mk
+
+if [[ $PLATFORM == "osx-x86_64-cpu" ]];
+then
+  echo "Building MXNet Backend on MAC OS"
+  echo "ADD_CFLAGS += -I/usr/local/opt/opencv/include" >> ./config.mk
+  echo "ADD_CFLAGS += -I/usr/local/opt/openblas/include" >> ./config.mk
+  echo "ADD_LDFLAGS += -L/usr/local/opt/opencv/lib" >> ./config.mk
+  echo "ADD_LDFLAGS += -L/usr/local/opt/openblas/lib" >> ./config.mk
+  echo "USE_OPENMP = 0" >> ./config.mk
+  echo "USE_LAPACK_PATH = /usr/local/opt/lapack/lib" >> ./config.mk
+  make -j$(sysctl -n hw.ncpu)
+elif [[ $PLATFORM == "linux-x86_64-cpu" ]];
+then
+  echo "Building MXNet Backend on Linux CPU"
+  echo "ADD_CFLAGS += -I/usr/local/include/opencv" >> ./config.mk
+  echo "ADD_LDFLAGS += -L/usr/local/lib" >> ./config.mk
+  echo "USE_OPENCV=1" >> ./config.mk
+  echo "USE_OPENMP=1" >> ./config.mk
+  echo "USE_BLAS=openblas" >> ./config.mk
+  echo "USE_LAPACK=1" >> ./config.mk
+  echo "USE_DIST_KVSTORE=1" >> ./config.mk
+  echo "USE_S3=1" >> ./config.mk
+  make -j$(cat /proc/cpuinfo | awk '/^processor/{print $3}' | tail -1)
+elif [[ $PLATFORM == "linux-x86_64-gpu" ]]
+then
+  echo "Building MXNet Backend on Linux GPU"
+  echo "ADD_CFLAGS += -I/usr/local/include/opencv" >> ./config.mk
+  echo "ADD_LDFLAGS += -L/usr/local/lib" >> ./config.mk
+  echo "USE_OPENCV=1" >> ./config.mk
+  echo "USE_OPENMP=1" >> ./config.mk
+  echo "USE_BLAS=openblas" >> ./config.mk
+  echo "USE_LAPACK=1" >> ./config.mk
+  echo "USE_DIST_KVSTORE=1" >> ./config.mk
+  echo "USE_S3=1" >> ./config.mk
+  echo "USE_CUDA=1" >> ./config.mk
+  echo "USE_CUDNN=1" >> ./config.mk
+  echo "ADD_CFLAGS += -I/usr/local/cuda/include" >> ./config.mk
+  echo "ADD_LDFLAGS += -L/usr/local/cuda/lib64/ " >> ./config.mk
+  # update the nccl version appropriately
+  echo "ADD_LDFLAGS += -L/lib/nccl/cuda-9.0/lib " >> ./config.mk
+  eval "export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/lib/nccl/cuda-9.0/lib"
+  eval "export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH"
+  make -j$(cat /proc/cpuinfo | awk '/^processor/{print $3}' | tail -1)
+else
+  echo "ERROR: unsupported platform: $PLATFORM"
+ echo "Currently supported platforms: osx-x86_64-cpu or linux-x86_64-cpu or linux-x86_64-gpu" +fi +chkret popd +echo "done building MXNet Backend" +exit 0 diff --git a/scala-package/infer/pom.xml b/scala-package/infer/pom.xml index 573684d2d..3425579ac 100644 --- a/scala-package/infer/pom.xml +++ b/scala-package/infer/pom.xml @@ -48,6 +48,13 @@ + + org.apache.maven.plugins + maven-deploy-plugin + + true + + org.apache.maven.plugins maven-jar-plugin diff --git a/scala-package/init-native/linux-x86_64/pom.xml b/scala-package/init-native/linux-x86_64/pom.xml index 2ddeaba7a..7a6d90866 100644 --- a/scala-package/init-native/linux-x86_64/pom.xml +++ b/scala-package/init-native/linux-x86_64/pom.xml @@ -28,6 +28,13 @@ + + org.apache.maven.plugins + maven-deploy-plugin + + true + + org.apache.maven.plugins maven-compiler-plugin @@ -55,7 +62,7 @@ -std=c++0x - -I../../../include + -I${project.basedir}/../../../include ${all_includes} ${cflags} diff --git a/scala-package/init-native/osx-x86_64/pom.xml b/scala-package/init-native/osx-x86_64/pom.xml index 120854986..0eaf4e343 100644 --- a/scala-package/init-native/osx-x86_64/pom.xml +++ b/scala-package/init-native/osx-x86_64/pom.xml @@ -28,6 +28,13 @@ + + org.apache.maven.plugins + maven-deploy-plugin + + true + + org.apache.maven.plugins maven-compiler-plugin @@ -55,7 +62,7 @@ -std=c++0x - -I../../../include + -I${project.basedir}/../../../include ${cflags} @@ -66,7 +73,7 @@ -Wl,-exported_symbol,_Java_* -Wl,-x ${lddeps} - -force_load ../../../lib/libmxnet.a + -force_load ${project.basedir}/../../../lib/libmxnet.a ${ldflags} diff --git a/scala-package/init-native/pom.xml b/scala-package/init-native/pom.xml index 8ac369d1d..1cd79a840 100644 --- a/scala-package/init-native/pom.xml +++ b/scala-package/init-native/pom.xml @@ -34,4 +34,17 @@ + + + + + org.apache.maven.plugins + maven-deploy-plugin + + true + + + + + diff --git a/scala-package/init/pom.xml b/scala-package/init/pom.xml index ef1d67b5d..2c00ca5ff 100644 --- a/scala-package/init/pom.xml +++ b/scala-package/init/pom.xml @@ -32,5 +32,38 @@ linux-x86_64-gpu + + apache-release + + + + + org.codehaus.mojo + exec-maven-plugin + 1.6.0 + + + compile-mxnet-backend + compile + + exec + + + bash + ${project.parent.basedir}/dev/compile-mxnet-backend.sh ${build.platform} ${project.parent.basedir}/../ + + + + + + org.apache.maven.plugins + maven-deploy-plugin + + true + + + + + diff --git a/scala-package/macros/pom.xml b/scala-package/macros/pom.xml index 15f13f394..795900987 100644 --- a/scala-package/macros/pom.xml +++ b/scala-package/macros/pom.xml @@ -73,6 +73,13 @@ + + org.apache.maven.plugins + maven-deploy-plugin + + true + + org.apache.maven.plugins maven-jar-plugin diff --git a/scala-package/native/linux-x86_64-cpu/pom.xml b/scala-package/native/linux-x86_64-cpu/pom.xml index 2504b1f31..c45635eb6 100644 --- a/scala-package/native/linux-x86_64-cpu/pom.xml +++ b/scala-package/native/linux-x86_64-cpu/pom.xml @@ -28,6 +28,13 @@ + + org.apache.maven.plugins + maven-deploy-plugin + + true + + org.apache.maven.plugins maven-compiler-plugin @@ -55,7 +62,7 @@ -std=c++0x - -I../../../include + -I${project.basedir}/../../../include ${all_includes} ${cflags} diff --git a/scala-package/native/linux-x86_64-gpu/pom.xml b/scala-package/native/linux-x86_64-gpu/pom.xml index aca290f6d..a1f5ec34d 100644 --- a/scala-package/native/linux-x86_64-gpu/pom.xml +++ b/scala-package/native/linux-x86_64-gpu/pom.xml @@ -28,6 +28,13 @@ + + org.apache.maven.plugins + maven-deploy-plugin + + true + + org.apache.maven.plugins 
maven-compiler-plugin @@ -55,7 +62,7 @@ -std=c++0x - -I../../../include + -I${project.basedir}/../../../include ${all_includes} ${cflags} diff --git a/scala-package/native/osx-x86_64-cpu/pom.xml b/scala-package/native/osx-x86_64-cpu/pom.xml index 15033535e..3f66fe68e 100644 --- a/scala-package/native/osx-x86_64-cpu/pom.xml +++ b/scala-package/native/osx-x86_64-cpu/pom.xml @@ -28,6 +28,13 @@ + + org.apache.maven.plugins + maven-deploy-plugin + + true + + org.apache.maven.plugins maven-compiler-plugin @@ -66,8 +73,6 @@ -Wl,-exported_symbol,_Java_* -Wl,-x ${lddeps} - -force_load ../../../lib/libmxnet.a - -force_load ../../../3rdparty/tvm/nnvm/lib/libnnvm.a ${ldflags} diff --git a/scala-package/native/pom.xml b/scala-package/native/pom.xml index 54ba2b57a..485b69f09 100644 --- a/scala-package/native/pom.xml +++ b/scala-package/native/pom.xml @@ -34,4 +34,17 @@ + + + + + org.apache.maven.plugins + maven-deploy-plugin + + true + + + + + diff --git a/scala-package/pom.xml b/scala-package/pom.xml index d931a82a6..c4f162008 100644 --- a/scala-package/pom.xml +++ b/scala-package/pom.xml @@ -3,31 +3,42 @@ xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> 4.0.0 + + org.apache + apache + 19 + org.apache.mxnet mxnet-parent_2.11 1.3.0-SNAPSHOT MXNet Scala Package - Parent https://github.com/apache/incubator-mxnet/tree/master/scala-package - MXNet Scala Package + + Scala Package for Apache MXNet (Incubating) - flexible and efficient library for deep learning. + - Distributed (Deep) Machine Learning Community - http://dmlc.ml + The Apache Software Foundation + https://www.apache.org/ - The Apache License, Version 2.0 - http://www.apache.org/licenses/LICENSE-2.0.txt + Apache License, Version 2.0 + https://www.apache.org/licenses/LICENSE-2.0.txt + repo + - scm:git:git@github.com:dmlc/mxnet.git - scm:git:git@github.com:dmlc/mxnet.git + scm:git:git@github.com:apache/incubator-mxnet.git + scm:git:git@github.com:apache/incubator-mxnet.git https://github.com/apache/incubator-mxnet + HEAD 2.11.8 2.11 + pom @@ -65,29 +76,6 @@ - - org.apache.maven.plugins - maven-gpg-plugin - - - sign-artifacts - verify - - sign - - - - - - org.sonatype.plugins - nexus-staging-maven-plugin - true - - ossrh - https://oss.sonatype.org/ - true - - @@ -161,15 +149,23 @@ - - - ossrh - https://oss.sonatype.org/content/repositories/snapshots - - - + + org.apache.maven.plugins + maven-release-plugin + + true + false + + + + org.apache.maven.plugins + maven-deploy-plugin + + true + + org.apache.maven.plugins maven-source-plugin @@ -213,11 +209,6 @@ maven-assembly-plugin 2.5.5 - - org.apache.maven.plugins - maven-deploy-plugin - 2.8.2 - org.apache.maven.plugins maven-surefire-plugin From 06f4ec7383da1667ffecf58147ba8f9ebb7d9a43 Mon Sep 17 00:00:00 2001 From: Kellen Sunderland Date: Wed, 25 Jul 2018 00:30:52 -0700 Subject: [PATCH 08/63] Enable three retries for Docker build commands (#11877) This enabled retries for Docker build commands executed by our master and PR builds. 
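The retry behavior this patch turns on can be pictured as the loop below. This is an illustrative sketch only, not the actual ci/build.py implementation; `build_with_retries` and its parameters are hypothetical stand-ins for the real `--docker-build-retries` handling.

    import subprocess
    import time

    def build_with_retries(cmd, retries=3, delay_s=2):
        # Hypothetical helper: run `cmd`, retrying on a non-zero exit status.
        for attempt in range(1, retries + 1):
            try:
                subprocess.check_call(cmd)
                return
            except subprocess.CalledProcessError:
                if attempt == retries:
                    raise  # out of attempts; surface the failure to the CI job
                time.sleep(delay_s)  # brief pause before the next attempt

    # Roughly what a retried Docker build amounts to:
    build_with_retries(["docker", "build", "."], retries=3)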
--- Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index ed998ee14..6d21f4964 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -115,7 +115,7 @@ def collect_test_results_windows(original_file_name, new_file_name) { def docker_run(platform, function_name, use_nvidia, shared_mem = '500m') { - def command = "ci/build.py --docker-registry ${env.DOCKER_CACHE_REGISTRY} %USE_NVIDIA% --platform %PLATFORM% --shm-size %SHARED_MEM% /work/runtime_functions.sh %FUNCTION_NAME%" + def command = "ci/build.py --docker-registry ${env.DOCKER_CACHE_REGISTRY} %USE_NVIDIA% --platform %PLATFORM% --docker-build-retries 3 --shm-size %SHARED_MEM% /work/runtime_functions.sh %FUNCTION_NAME%" command = command.replaceAll('%USE_NVIDIA%', use_nvidia ? '--nvidiadocker' : '') command = command.replaceAll('%PLATFORM%', platform) command = command.replaceAll('%FUNCTION_NAME%', function_name) From 0b8b93995e253f362399de1e2e8c63d112933289 Mon Sep 17 00:00:00 2001 From: Todd Sundsted Date: Wed, 25 Jul 2018 13:52:15 -0400 Subject: [PATCH 09/63] Avoid Division by Zero (#11397) * Return if iteration counter `N` is less than or equal to zero. * Fix spelling. --- src/operator/random/sampler.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/operator/random/sampler.h b/src/operator/random/sampler.h index e211e7547..44f80ab56 100644 --- a/src/operator/random/sampler.h +++ b/src/operator/random/sampler.h @@ -38,12 +38,17 @@ namespace op { * \brief Launch a generic kernel with parallel random generator. * \tparam gen random generator * \tparam N Number of iterations - * \tparam Args Varargs type to eventually pass to the OP::Map() functoion + * \tparam Args Varargs type to eventually pass to the OP::Map() function */ template inline static void LaunchRNG(mshadow::Stream *s, common::random::RandGenerator *gen, const int N, Args... args) { + // minimal check to avoid division by zero, below. + // if `N` is zero the map operation is a no-op in any case. 
+ if (N <= 0) { + return; + } const int nloop = (N + RandGenerator::kMinNumRandomPerThread - 1) / RandGenerator::kMinNumRandomPerThread; const int nthread = std::min(nloop, RandGenerator::kNumRandomStates); From fe1c7ab1711debac2e615a6608e7ac671ce34f10 Mon Sep 17 00:00:00 2001 From: Hao Jin Date: Wed, 25 Jul 2018 16:58:55 -0400 Subject: [PATCH 10/63] making AddTakeGrad as default for backward of embedding and take to avoid nan (#11795) --- src/operator/tensor/indexing_op.h | 76 +------------------------------ 1 file changed, 2 insertions(+), 74 deletions(-) diff --git a/src/operator/tensor/indexing_op.h b/src/operator/tensor/indexing_op.h index 3d349c9f4..07d96dcbd 100644 --- a/src/operator/tensor/indexing_op.h +++ b/src/operator/tensor/indexing_op.h @@ -548,46 +548,6 @@ struct tcast_clip { } }; -template -void AddTakeGradLargeBatchCaller(const OpContext& ctx, mshadow::Tensor dst, - const mshadow::Tensor& index, - const mshadow::Tensor &src) { - using namespace mxnet_op; - using namespace mshadow::expr; - - Stream *s = ctx.get_stream(); - - // Calculate amount of temporary storage - size_t sort_workspace_size = mxnet::op::SortByKeyWorkspaceSize - (index.shape_.Size()); - size_t addtake_workspace_size = mxnet::op::AddTakeGradLargeBatchWorkspaceSize - (index.shape_.Size()); - size_t temp_storage_size = std::max(sort_workspace_size, addtake_workspace_size); - size_t workspace_size = 2*(index.shape_.Size()*sizeof(int)) + temp_storage_size; - - // Request temporary storage - Tensor workspace = - ctx.requested[embedding::kTempSpace].get_space_typed( - Shape1(workspace_size), s); - - // Create tensors - size_t pos = 0; - Tensor sorted_data(reinterpret_cast(&workspace[pos]), - Shape1(index.shape_.Size()), s); - pos += index.shape_.Size()*sizeof(int); - Tensor original_index(reinterpret_cast(&workspace[pos]), - Shape1(index.shape_.Size()), s); - pos += index.shape_.Size()*sizeof(int); - Tensor temp_storage(&workspace[pos], Shape1(temp_storage_size), s); - Kernel::Launch(s, index.shape_.Size(), sorted_data.dptr_, index.dptr_, - static_cast(dst.shape_[0])); - Kernel::Launch(s, index.shape_.Size(), - 1, 0, 1, kWriteTo, original_index.dptr_); - int num_bits = ilog2((dst.shape_[0] - 1)); - mxnet::op::SortByKey(sorted_data, original_index, true, &temp_storage, 0, num_bits); - mxnet::op::AddTakeGradLargeBatch(dst, sorted_data, original_index, src, &temp_storage); -} - template void EmbeddingOpBackward(const nnvm::NodeAttrs& attrs, const OpContext& ctx, @@ -619,25 +579,7 @@ void EmbeddingOpBackward(const nnvm::NodeAttrs& attrs, if (req[embedding::kWeight] == kWriteTo) { grad_in = scalar(0.0f); } - // shape_out_prod ~= the number of elements loaded in AddTakeGrad - // shape_in_prod ~= the number of elements stored in AddTakeGrad - // When the number of elements processed is low, use AddTakeGrad. 
- // The approximate cut-off value 16384 was found experimentally on Titan X Pascal - uint64_t shape_in_prod = - static_cast(grad_in.shape_[0])* - static_cast(grad_in.shape_[1]); - uint64_t shape_out_prod = - static_cast(grad_out.shape_[0])* - static_cast(grad_out.shape_[1]); - - static bool force_addtakegrad = - dmlc::GetEnv("MXNET_FORCE_ADDTAKEGRAD", false); - if (force_addtakegrad || (shape_out_prod < (uint64_t)16384 && - shape_in_prod < (uint64_t)16384)) { - AddTakeGrad(grad_in, data, grad_out); - } else { - AddTakeGradLargeBatchCaller(ctx, grad_in, data, grad_out); - } + AddTakeGrad(grad_in, data, grad_out); } else { LOG(FATAL) << "wrong req"; } @@ -1132,21 +1074,7 @@ void TakeOpBackward(const nnvm::NodeAttrs& attrs, if (req[take_::kArr] == kWriteTo) { grad_in = scalar(0.0f); } - // shape_out_prod ~= the number of elements loaded in AddTakeGrad - // shape_in_prod ~= the number of elements stored in AddTakeGrad - // When the number of elements processed is low, use AddTakeGrad. - // The approximate cut-off value 16384 was found experimentally on Titan X Pascal - uint64_t shape_in_prod = - static_cast(grad_in.shape_[0])* - static_cast(grad_in.shape_[1]); - uint64_t shape_out_prod = - static_cast(grad_out.shape_[0])* - static_cast(grad_out.shape_[1]); - if (shape_out_prod < (uint64_t)16384 && shape_in_prod < (uint64_t)16384) { - AddTakeGrad(grad_in, idx, grad_out); - } else { - AddTakeGradLargeBatchCaller(ctx, grad_in, idx, grad_out); - } + AddTakeGrad(grad_in, idx, grad_out); } else { LOG(FATAL) << "wrong req"; } From be478700e01944ecfee0c30a7cc5dc07d1b2789a Mon Sep 17 00:00:00 2001 From: jeremiedb Date: Wed, 25 Jul 2018 17:10:16 -0400 Subject: [PATCH 11/63] [MXNET-563] Refactor R optimizers to fix memory leak (#11374) * refactor R optimizers to fix memory leak * add Adadelta and Adagrad * fix comments * fix comments * fix comments * add tests * fix whitespaces * fix whitespaces * fix typo * fix typo * add doc on clipping --- R-package/R/model.R | 2 +- R-package/R/model.rnn.R | 170 ++++---- R-package/R/optimizer.R | 450 ++++++++++++---------- R-package/tests/testthat/test_optimizer.R | 204 ++++++++++ 4 files changed, 545 insertions(+), 281 deletions(-) create mode 100644 R-package/tests/testthat/test_optimizer.R diff --git a/R-package/R/model.R b/R-package/R/model.R index b461f7973..a2c441968 100644 --- a/R-package/R/model.R +++ b/R-package/R/model.R @@ -147,7 +147,7 @@ mx.model.train <- function(symbol, ctx, input.shape, output.shape, kvstore$set.optimizer(optimizer) } else { updaters <- lapply(seq_len(ndevice), function(i) { - mx.opt.get.updater(optimizer, train.execs[[i]]$ref.arg.arrays) + mx.opt.get.updater(optimizer, train.execs[[i]]$ref.arg.arrays, ctx = ctx[[i]]) }) } if (!is.null(kvstore)) { diff --git a/R-package/R/model.rnn.R b/R-package/R/model.rnn.R index f328d1ba6..580c82a0a 100644 --- a/R-package/R/model.rnn.R +++ b/R-package/R/model.rnn.R @@ -1,51 +1,50 @@ # Internal function to do multiple device training on RNN -mx.model.train.buckets <- function(symbol, ctx, train.data, eval.data, - dlist, arg.params, aux.params, - grad.req, arg.update.idx, +mx.model.train.buckets <- function(symbol, ctx, train.data, eval.data, + dlist, arg.params, aux.params, + grad.req, arg.update.idx, begin.round, end.round, optimizer, metric, metric_cpu, - epoch.end.callback, batch.end.callback, kvstore, verbose, - gc_freq) { - + epoch.end.callback, batch.end.callback, kvstore, verbose) { + ndevice <- length(ctx) - if (verbose) + if (verbose) message("Start training with ", ndevice, " devices") - + 
input.names <- names(dlist) arg.params.names <- names(arg.params) - + if (is.list(symbol)) sym_ini <- symbol[[names(train.data$bucketID)]] else sym_ini <- symbol - + slices <- lapply(seq_len(ndevice), function(i) { sapply(names(dlist), function(n) mx.nd.split(data=dlist[[n]], num_outputs = ndevice, axis = 0, squeeze_axis = FALSE)) }) - + train.execs <- lapply(seq_len(ndevice), function(i) { s <- slices[[i]] - mx.symbol.bind(symbol = sym_ini, arg.arrays = c(s, arg.params)[arg.update.idx], + mx.symbol.bind(symbol = sym_ini, arg.arrays = c(s, arg.params)[arg.update.idx], aux.arrays = aux.params, ctx = ctx[[i]], grad.req = grad.req) }) - + # KVStore related stuffs params.index <- as.integer( mx.util.filter.null( lapply(seq_along(train.execs[[1]]$ref.grad.arrays), function(k) { if (!is.null(train.execs[[1]]$ref.grad.arrays[[k]])) k else NULL} ))) - + update.on.kvstore <- FALSE if (!is.null(kvstore) && kvstore$update.on.kvstore) { update.on.kvstore <- TRUE kvstore$set.optimizer(optimizer) } else { updaters <- lapply(seq_len(ndevice), function(i) { - mx.opt.get.updater(optimizer, train.execs[[i]]$ref.arg.arrays) + mx.opt.get.updater(optimizer, train.execs[[i]]$ref.arg.arrays, ctx = ctx[[i]]) }) } - + if (!is.null(kvstore)) { kvstore$init(params.index, train.execs[[1]]$ref.arg.arrays[params.index]) } - + # train over specified number of epochs for (iteration in begin.round:end.round) { nbatch <- 0 @@ -55,20 +54,20 @@ mx.model.train.buckets <- function(symbol, ctx, train.data, eval.data, } train.data$reset() while (train.data$iter.next()) { - + # Get iterator data dlist <- train.data$value()[input.names] - + # Slice inputs for multi-devices slices <- lapply(seq_len(ndevice), function(i) { sapply(names(dlist), function(n) mx.nd.split(data=dlist[[n]], num_outputs = ndevice, axis = 0, squeeze_axis = F)) }) - + # Assign input to each executor - bug on inference if using BatchNorm if (is.list(symbol)) { train.execs <- lapply(seq_len(ndevice), function(i) { s <- slices[[i]] - mx.symbol.bind(symbol = symbol[[names(train.data$bucketID)]], + mx.symbol.bind(symbol = symbol[[names(train.data$bucketID)]], arg.arrays = c(s, train.execs[[i]]$arg.arrays[arg.params.names])[arg.update.idx], aux.arrays = train.execs[[i]]$aux.arrays, ctx = ctx[[i]], grad.req = grad.req) }) @@ -78,12 +77,12 @@ mx.model.train.buckets <- function(symbol, ctx, train.data, eval.data, mx.exec.update.arg.arrays(train.execs[[i]], s, match.name=TRUE) } } - + # forward pass for (texec in train.execs) { mx.exec.forward(texec, is.train = TRUE) } - + # copy of preds and labels for metric if (!is.null(metric)) { preds <- lapply(train.execs, function(texec) {texec$ref.outputs[[1]]}) @@ -93,12 +92,12 @@ mx.model.train.buckets <- function(symbol, ctx, train.data, eval.data, labels <- lapply(seq_along(train.execs), function(i) {mx.nd.copyto(labels[[i]], mx.cpu())}) } } - + # backward pass for (texec in train.execs) { mx.exec.backward(texec) } - + if (!is.null(kvstore)) { # push the gradient kvstore$push(params.index, lapply(train.execs, function(texec) { @@ -124,7 +123,7 @@ mx.model.train.buckets <- function(symbol, ctx, train.data, eval.data, mx.exec.update.arg.arrays(train.execs[[i]], arg.blocks[[i]], skip.null = TRUE) } } - + # Update the evaluation metrics if (!is.null(metric)) { for (i in seq_len(ndevice)) { @@ -133,43 +132,40 @@ mx.model.train.buckets <- function(symbol, ctx, train.data, eval.data, state = train.metric) } } - + nbatch <- nbatch + 1 - if (!is.null(gc_freq)) { - if (nbatch %% gc_freq == 0) gc() - } - + if 
(!is.null(batch.end.callback)) { batch.end.callback(iteration, nbatch, environment()) } } - + if (!is.null(metric)) { result <- metric$get(train.metric) - if (verbose) + if (verbose) message("[", iteration, "] Train-", result$name, "=", result$value) } - + if (!is.null(eval.data)) { if (!is.null(metric)) { eval.metric <- metric$init() } eval.data$reset() while (eval.data$iter.next()) { - + # Get iterator data dlist <- eval.data$value()[input.names] - + # Slice input to multiple devices slices <- lapply(seq_len(ndevice), function(i) { sapply(names(dlist), function(n) mx.nd.split(data=dlist[[n]], num_outputs = ndevice, axis = 0, squeeze_axis = FALSE)) }) - + # Assign input to each executor - bug on inference if using BatchNorm if (is.list(symbol)) { train.execs <- lapply(seq_len(ndevice), function(i) { s <- slices[[i]] - mx.symbol.bind(symbol = symbol[[names(eval.data$bucketID)]], + mx.symbol.bind(symbol = symbol[[names(eval.data$bucketID)]], arg.arrays = c(s, train.execs[[i]]$arg.arrays[arg.params.names])[arg.update.idx], aux.arrays = train.execs[[i]]$aux.arrays, ctx = ctx[[i]], grad.req = grad.req) }) @@ -179,12 +175,12 @@ mx.model.train.buckets <- function(symbol, ctx, train.data, eval.data, mx.exec.update.arg.arrays(train.execs[[i]], s, match.name=TRUE) } } - + # forward pass for (texec in train.execs) { mx.exec.forward(texec, is.train = FALSE) } - + # copy of preds and labels for metric and update metric if (!is.null(metric)) { preds <- lapply(train.execs, function(texec) {texec$ref.outputs[[1]]}) @@ -194,17 +190,17 @@ mx.model.train.buckets <- function(symbol, ctx, train.data, eval.data, labels <- lapply(seq_along(train.execs), function(i) {mx.nd.copyto(labels[[i]], mx.cpu())}) } for (i in seq_len(ndevice)) { - eval.metric <- metric$update(label = labels[[i]], - pred = preds[[i]], + eval.metric <- metric$update(label = labels[[i]], + pred = preds[[i]], state = eval.metric) } } } - + if (!is.null(metric)) { result <- metric$get(eval.metric) if (verbose) { - message("[", iteration, "] Validation-", result$name, "=", + message("[", iteration, "] Validation-", result$name, "=", result$value) } } @@ -213,12 +209,12 @@ mx.model.train.buckets <- function(symbol, ctx, train.data, eval.data, } # get the model out model <- mx.model.extract.model(sym_ini, train.execs) - + epoch_continue <- TRUE if (!is.null(epoch.end.callback)) { epoch_continue <- epoch.end.callback(iteration, 0, environment(), verbose = verbose) } - + if (!epoch_continue) { break } @@ -227,7 +223,7 @@ mx.model.train.buckets <- function(symbol, ctx, train.data, eval.data, } -# +# #' Train RNN with bucket support #' #' @param symbol Symbol or list of Symbols representing the model @@ -245,33 +241,33 @@ mx.model.train.buckets <- function(symbol, ctx, train.data, eval.data, #' @param verbose #' #' @export -mx.model.buckets <- function(symbol, train.data, eval.data = NULL, metric = NULL, - arg.params = NULL, aux.params = NULL, fixed.params = NULL, - num.round = 1, begin.round = 1, - initializer = mx.init.uniform(0.01), optimizer = "sgd", ctx = NULL, - batch.end.callback = NULL, epoch.end.callback = NULL, - kvstore = "local", verbose = TRUE, metric_cpu = TRUE, gc_freq = NULL) { - +mx.model.buckets <- function(symbol, train.data, eval.data = NULL, metric = NULL, + arg.params = NULL, aux.params = NULL, fixed.params = NULL, + num.round = 1, begin.round = 1, + initializer = mx.init.uniform(0.01), optimizer = "sgd", ctx = NULL, + batch.end.callback = NULL, epoch.end.callback = NULL, + kvstore = "local", verbose = TRUE, metric_cpu = TRUE) { 
+ if (!train.data$iter.next()) { train.data$reset() - if (!train.data$iter.next()) + if (!train.data$iter.next()) stop("Empty train.data") } - + if (!is.null(eval.data)) { if (!eval.data$iter.next()) { eval.data$reset() - if (!eval.data$iter.next()) + if (!eval.data$iter.next()) stop("Empty eval.data") } } - - if (is.null(ctx)) + + if (is.null(ctx)) ctx <- mx.ctx.default() if (is.mx.context(ctx)) { ctx <- list(ctx) } - if (!is.list(ctx)) + if (!is.list(ctx)) stop("ctx must be mx.context or list of mx.context") if (is.character(optimizer)) { if (is.numeric(input.shape)) { @@ -283,75 +279,75 @@ mx.model.buckets <- function(symbol, train.data, eval.data = NULL, metric = NULL } optimizer <- mx.opt.create(optimizer, rescale.grad = (1/batchsize), ...) } - + sym_ini <- if (is.list(symbol)) symbol[[names(train.data$bucketID)]] else symbol - + arguments <- sym_ini$arguments input.names <- intersect(names(train.data$value()), arguments) - + input.shape <- sapply(input.names, function(n) { dim(train.data$value()[[n]]) }, simplify = FALSE) - + shapes <- sym_ini$infer.shape(input.shape) - + # assign arg.params and aux.params arguments to arg.params.input and aux.params.input arg.params.input <- arg.params aux.params.input <- aux.params - + # initialize all arguments with zeros arg.params <- lapply(shapes$arg.shapes, function(shape) { mx.nd.zeros(shape = shape, ctx = mx.cpu()) }) - + # initialize input parameters dlist <- arg.params[input.names] - + # initialize parameters - only argument ending with _weight and _bias are initialized arg.params.ini <- mx.init.create(initializer = initializer, shape.array = shapes$arg.shapes, ctx = mx.cpu(), skip.unknown = TRUE) - + # assign initilized parameters to arg.params arg.params[names(arg.params.ini)] <- arg.params.ini - + # assign input params to arg.params arg.params[names(arg.params.input)] <- arg.params.input - + # remove input params from arg.params arg.params[input.names] <- NULL - + # Grad request grad.req <- rep("null", length(arguments)) grad.req.write <- arguments %in% setdiff(names(arg.params.ini), fixed.params) grad.req[grad.req.write] <- "write" - + # Arg array order update_names <- c(input.names, names(arg.params)) arg.update.idx <- match(arguments, update_names) - + # aux parameters setup aux.params <- lapply(shapes$aux.shapes, function(shape) { mx.nd.zeros(shape = shape, ctx = mx.cpu()) }) - + aux.params.ini <- mx.init.create(initializer, shapes$aux.shapes, ctx = mx.cpu(), skip.unknown = FALSE) if (length(aux.params) > 0) { aux.params[names(aux.params.ini)] <- aux.params.ini } else aux.params <- NULL - + aux.params[names(aux.params.input)] <- aux.params.input - + # kvstore initialization - kvstore <- mx.model.create.kvstore(kvstore, params$arg.params, length(ctx), + kvstore <- mx.model.create.kvstore(kvstore, params$arg.params, length(ctx), verbose = verbose) - + ### Execute training - model <- mx.model.train.buckets(symbol = symbol, ctx = ctx, train.data = train.data, eval.data = eval.data, - dlist = dlist, arg.params = arg.params, aux.params = aux.params, - grad.req = grad.req, arg.update.idx = arg.update.idx, - optimizer = optimizer, metric = metric, - begin.round = begin.round, end.round = num.round, - batch.end.callback = batch.end.callback, epoch.end.callback = epoch.end.callback, - kvstore = kvstore, verbose = verbose, metric_cpu = metric_cpu, gc_freq = gc_freq) - + model <- mx.model.train.buckets(symbol = symbol, ctx = ctx, train.data = train.data, eval.data = eval.data, + dlist = dlist, arg.params = arg.params, aux.params = aux.params, + 
grad.req = grad.req, arg.update.idx = arg.update.idx, + optimizer = optimizer, metric = metric, + begin.round = begin.round, end.round = num.round, + batch.end.callback = batch.end.callback, epoch.end.callback = epoch.end.callback, + kvstore = kvstore, verbose = verbose, metric_cpu = metric_cpu) + return(model) } diff --git a/R-package/R/optimizer.R b/R-package/R/optimizer.R index 3c503c2e8..7283f677f 100644 --- a/R-package/R/optimizer.R +++ b/R-package/R/optimizer.R @@ -1,31 +1,69 @@ #' Create an SGD optimizer with respective parameters. #' Perform SGD with momentum update #' -mx.opt.sgd <- function(learning.rate, - momentum=0, - wd=0, - rescale.grad=1, - clip_gradient = NULL, +#' @param learning.rate float, default=0.01 +#' The initial learning rate. +#' @param momentum float, default=0 +#' The momentumvalue +#' @param wd float, default=0.0 +#' L2 regularization coefficient add to all the weights. +#' @param rescale.grad float, default=1.0 +#' rescaling factor of gradient. +#' @param clip_gradient float, optional, default=-1 (no clipping if < 0) +#' clip gradient in range [-clip_gradient, clip_gradient]. +#' @param lr_scheduler function, optional +#' The learning rate scheduler. +mx.opt.sgd <- function(learning.rate = 0.01, + momentum = 0, + wd = 0, + rescale.grad = 1, + clip_gradient = -1, lr_scheduler = NULL) { - # use lr as short for learing rate. + lr <- learning.rate - count <- 0 - num_update <- 0 + count <- 0 + num_update <- 0 sgd <- new.env() sgd$lr <- lr sgd$count <- 0 sgd$num_update <- 0 - create.state <- function(index, weight) { + create_exec <- function(index, weight_dim, ctx) { + if (momentum == 0) { - return(NULL) + + weight <- mx.symbol.Variable("weight") + grad <- mx.symbol.Variable("grad") + + sym <- mx.symbol.sgd_update(weight, + grad, + lr = lr, + wd = wd, + rescale_grad = rescale.grad, + clip_gradient = clip_gradient, + name = "w") } else { - ret <- (mx.nd.zeros(dim(weight), ctx(weight))) - return(ret) + + weight <- mx.symbol.Variable("weight") + grad <- mx.symbol.Variable("grad") + mom <- mx.symbol.Variable("mom") + + sym <- mx.symbol.sgd_mom_update(weight, + grad, + mom, + lr = lr, + wd = wd, + momentum= momentum, + rescale_grad = rescale.grad, + clip_gradient = clip_gradient, + name = "w") } + exec <- mx.simple.bind(symbol = sym, weight = weight_dim, ctx = ctx, grad.req = "null") + return(exec) } - update <- function(index, weight, grad, state) { + + update <- function(index, exec_w, weight, grad) { if (!is.null(lr_scheduler)){ lr_scheduler(sgd) ## changing lr @@ -40,77 +78,104 @@ mx.opt.sgd <- function(learning.rate, sgd$num_update <- max(sgd$num_update, sgd[[indexKey]]) } } - grad <- grad * rescale.grad - if (!is.null(clip_gradient)){ - if(clip_gradient >= 0){ - grad <- mx.nd.clip(grad, -clip_gradient, clip_gradient) - } else { - stop("Error: clip_gradient should be positive number.") - } - } - if (is.null(state)) { - weight <- weight - lr * (grad + wd * weight) - } else { - mom <- state - mom <- mom * momentum - mom <- mom - lr * (grad + wd * weight) - weight <- weight + mom - state <- mom - } - return(list(weight=weight, state=state)) + + mx.exec.update.arg.arrays(exec_w, arg.arrays = list(weight = weight,grad = grad), match.name = T) + mx.exec.forward(exec_w, is.train = F) + return(exec_w$ref.outputs$w_output) } - return(list(create.state=create.state, update=update)) + return(list(create_exec = create_exec, update = update)) } #' Create an RMSProp optimizer with respective parameters. #' Reference: Tieleman T, Hinton G. 
Lecture 6.5-rmsprop: Divide the gradient by a running average of its recent magnitude[J]. COURSERA: Neural Networks for Machine Learning, 2012, 4(2). #' The code follows: http://arxiv.org/pdf/1308.0850v5.pdf Eq(38) - Eq(45) by Alex Graves, 2013. -#' +#' #' @param learning.rate float, default=0.002 -#' Step size. +#' The initial learning rate. #' @param gamma1 float, default=0.95 #' decay factor of moving average for gradient, gradient^2. -#' @param gamm2 float, default=0.9 +#' @param gamma2 float, default=0.9 #' "momentum" factor. +#' @param epsilon float, default=1e-4 #' @param wd float, default=0.0 #' L2 regularization coefficient add to all the weights. #' @param rescale.grad float, default=1.0 #' rescaling factor of gradient. -#' @param clip_gradient float, optional +#' @param clip_gradient float, optional, default=-1 (no clipping if < 0) #' clip gradient in range [-clip_gradient, clip_gradient]. #' @param lr_scheduler function, optional #' The learning rate scheduler. #' -mx.opt.rmsprop <- function(learning.rate=0.002, - gamma1=0.95, - gamma2=0.9, - wd=0, - rescale.grad=1, - clip_gradient = NULL, +mx.opt.rmsprop <- function(learning.rate = 0.002, + centered = TRUE, + gamma1 = 0.95, + gamma2 = 0.9, + epsilon = 1e-4, + wd = 0, + rescale.grad = 1, + clip_gradient = -1, lr_scheduler = NULL) { - # use lr as short for learing rate. + lr <- learning.rate - count <- 0 - num_update <- 0 + count <- 0 + num_update <- 0 rmsprop <- new.env() rmsprop$lr <- lr rmsprop$count <- 0 rmsprop$num_update <- 0 - create.state <- function(index, weight) { - return (list(n=mx.nd.zeros(dim(weight), ctx(weight)), - g=mx.nd.zeros(dim(weight), ctx(weight)), - delta=mx.nd.zeros(dim(weight), ctx(weight)))) + create_exec <- function(index, weight_dim, ctx) { + + if (centered) { + + weight <- mx.symbol.Variable("weight") + grad <- mx.symbol.Variable("grad") + n <- mx.symbol.Variable("n") + g <- mx.symbol.Variable("g") + delta <- mx.symbol.Variable("delta") + + sym <- mx.symbol.rmspropalex_update(weight, + grad, + n, + g, + delta, + lr = lr, + gamma1 = gamma1, + gamma2 = gamma2, + epsilon = epsilon, + wd = wd, + rescale_grad = rescale.grad, + clip_gradient = clip_gradient, + name = "w") + } else { + weight <- mx.symbol.Variable("weight") + grad <- mx.symbol.Variable("grad") + n <- mx.symbol.Variable("n") + + sym <- mx.symbol.rmsprop_update(weight, + grad, + n, + lr = lr, + gamma1 = gamma1, + epsilon = epsilon, + wd = wd, + rescale_grad = rescale.grad, + clip_gradient = clip_gradient, + name = "w") + } + + exec <- mx.simple.bind(symbol = sym, weight = weight_dim, ctx = ctx, grad.req = "null") + return(exec) } - update <- function(index, weight, grad, state) { + update <- function(index, exec_w, weight, grad) { if (!is.null(lr_scheduler)){ lr_scheduler(rmsprop) ## changing lr lr <- rmsprop$lr ## update count indexKey <- paste0('ik', index) - if (!exists(envir = rmsprop, x = indexKey, inherits = FALSE)){ + if (!exists(envir = rmsprop, x = indexKey, inherits = FALSE)) { rmsprop[[indexKey]] <- 0 } else { indexValue <- rmsprop[[indexKey]] @@ -118,27 +183,12 @@ mx.opt.rmsprop <- function(learning.rate=0.002, rmsprop$num_update <- max(rmsprop$num_update, rmsprop[[indexKey]]) } } - grad <- grad * rescale.grad - if (!is.null(clip_gradient)){ - if(clip_gradient >= 0){ - grad <- mx.nd.clip(grad, -clip_gradient, clip_gradient) - } else { - stop("Error: clip_gradient should be positive number.") - } - } - n <- state$n - g <- state$g - delta <- state$delta - n <- gamma1 * n + (1 - gamma1) * (grad * grad) - g <- gamma1 * g + (1 - 
gamma1) * grad - delta <- gamma2 * delta - lr * (grad / mx.nd.sqrt(n - g*g + 1e-4) + wd * weight) - weight <- weight + delta - state <- list(n=n, g=g, delta=delta) - - return(list(weight=weight, state=state)) + mx.exec.update.arg.arrays(exec_w, arg.arrays = list(weight = weight,grad = grad), match.name = T) + mx.exec.forward(exec_w, is.train = F) + return(exec_w$ref.outputs$w_output) } - return(list(create.state=create.state, update=update)) + return(list(create_exec = create_exec, update = update)) } #' Create an Adam optimizer with respective parameters. @@ -148,8 +198,8 @@ mx.opt.rmsprop <- function(learning.rate=0.002, #' Adam: A Method for Stochastic Optimization, #' http://arxiv.org/abs/1412.6980 #' -#' @param learning.rate float, default=0.001 -#' Step size. +#' @param learning.rate float, default=1e-3 +#' The initial learning rate. #' @param beta1 float, default=0.9 #' Exponential decay rate for the first moment estimates. #' @param beta2 float, default=0.999 @@ -159,41 +209,60 @@ mx.opt.rmsprop <- function(learning.rate=0.002, #' L2 regularization coefficient add to all the weights. #' @param rescale.grad float, default=1.0 #' rescaling factor of gradient. -#' @param clip_gradient float, optional +#' @param clip_gradient float, optional, default=-1 (no clipping if < 0) #' clip gradient in range [-clip_gradient, clip_gradient]. #' @param lr_scheduler function, optional #' The learning rate scheduler. #' -mx.opt.adam <- function(learning.rate=0.001, - beta1=0.9, - beta2=0.999, - epsilon=1e-8, - wd=0, - rescale.grad=1, - clip_gradient = NULL, +mx.opt.adam <- function(learning.rate = 1e-3, + beta1 = 0.9, + beta2 = 0.999, + epsilon = 1e-8, + wd = 0, + rescale.grad = 1, + clip_gradient = -1, lr_scheduler = NULL) { - # use lr as short for learing rate. 
+ lr <- learning.rate - count <- 0 - num_update <- 0 + count <- 0 + num_update <- 0 adam <- new.env() adam$lr <- lr adam$count <- 0 adam$num_update <- 0 - create.state <- function(index, weight) { - return (list(mean=mx.nd.zeros(dim(weight), ctx(weight)), - variance=mx.nd.zeros(dim(weight), ctx(weight)))) + create_exec <- function(index, weight_dim, ctx) { + + weight <- mx.symbol.Variable("weight") + grad <- mx.symbol.Variable("grad") + mean <- mx.symbol.Variable("mean") + var <- mx.symbol.Variable("var") + + sym <- mx.symbol.adam_update(weight, + grad, + mean, + var, + lr = lr, + beta1 = beta1, + beta2 = beta2, + epsilon = epsilon, + wd = wd, + rescale_grad = rescale.grad, + clip_gradient = clip_gradient, + name = "w") + + exec <- mx.simple.bind(symbol = sym, weight = weight_dim, ctx = ctx, grad.req = "null") + return(exec) } - update <- function(index, weight, grad, state) { + update <- function(index, exec_w, weight, grad) { if (!is.null(lr_scheduler)){ lr_scheduler(adam) ## changing lr lr <- adam$lr ## update count indexKey <- paste0('ik', index) - if (!exists(envir = adam, x = indexKey, inherits = FALSE)){ + if (!exists(envir = adam, x = indexKey, inherits = FALSE)) { adam[[indexKey]] <- 0 } else { indexValue <- adam[[indexKey]] @@ -202,44 +271,15 @@ mx.opt.adam <- function(learning.rate=0.001, } } - # increment time - time.key <- paste0('t', index) - if (!exists(envir = adam, x = time.key, inherits = FALSE)){ - adam[[time.key]] <- 0 - } - t <- adam[[time.key]] - t <- t + 1 - adam[[time.key]] <- t - - mean <- state$mean - variance <- state$variance - - grad <- grad * rescale.grad - if (!is.null(clip_gradient)){ - if(clip_gradient >= 0){ - grad <- mx.nd.clip(grad, -clip_gradient, clip_gradient) - } else { - stop("Error: clip_gradient should be positive number.") - } - } - - mean <- beta1 * mean + (1 - beta1) * grad - variance <- beta2 * variance + (1 - beta2) * (grad * grad) - - coef1 <- 1 - beta1^t - coef2 <- 1 - beta2^t - lr <- lr * sqrt(coef2)/coef1 - - weight <- weight - lr * mean / (mx.nd.sqrt(variance) + epsilon) - weight <- weight - lr * wd * weight - - state <- list(mean=mean, variance=variance) - - return(list(weight=weight, state=state)) + mx.exec.update.arg.arrays(exec_w, arg.arrays = list(weight = weight,grad = grad), match.name = T) + mx.exec.forward(exec_w, is.train = F) + return(exec_w$ref.outputs$w_output) } - return(list(create.state=create.state, update=update)) + return(list(create_exec = create_exec, update = update)) } + + #' Create an AdaGrad optimizer with respective parameters. #' AdaGrad optimizer of Duchi et al., 2011, #' @@ -254,38 +294,58 @@ mx.opt.adam <- function(learning.rate=0.001, #' L2 regularization coefficient add to all the weights. #' @param rescale.grad float, default=1.0 #' rescaling factor of gradient. -#' @param clip_gradient float, optional +#' @param clip_gradient float, default=-1.0 (no clipping if < 0) #' clip gradient in range [-clip_gradient, clip_gradient]. #' @param lr_scheduler function, optional #' The learning rate scheduler. #' -mx.opt.adagrad <- function(learning.rate=0.05, - epsilon=1e-8, - wd=0, - rescale.grad=1, - clip_gradient = NULL, +mx.opt.adagrad <- function(learning.rate = 0.05, + epsilon = 1e-8, + wd = 0, + rescale.grad = 1, + clip_gradient = -1, lr_scheduler = NULL) { # use lr as short for learing rate. 
lr <- learning.rate - count <- 0 - num_update <- 0 + count <- 0 + num_update <- 0 adagrad <- new.env() adagrad$lr <- lr adagrad$count <- 0 adagrad$num_update <- 0 - create.state <- function(index, weight) { - return (mx.nd.zeros(dim(weight), ctx(weight))) #history + create_exec <- function(index, weight_dim, ctx) { + + weight <- mx.symbol.Variable("weight") + grad <- mx.symbol.Variable("grad") + history <- mx.symbol.Variable("history") + + grad <- grad * rescale.grad + if (!is.null(clip_gradient)) { + if (clip_gradient >= 0) { + grad <- mx.symbol.clip(data = grad, a.min = -clip_gradient, a.max = clip_gradient) + } + } + + history <- history + (grad * grad) + weight <- weight - lr * (grad / mx.symbol.sqrt(history + epsilon) + wd * weight) + + w <- mx.symbol.identity(weight, name = "w") + h <- mx.symbol.identity(history, name = "h") + sym <- mx.symbol.Group(c(w, h)) + + exec <- mx.simple.bind(symbol = sym, weight = weight_dim, ctx = ctx, grad.req = "null") + return(exec) } - update <- function(index, weight, grad, state) { - if (!is.null(lr_scheduler)){ + update <- function(index, exec_w, weight, grad) { + if (!is.null(lr_scheduler)) { lr_scheduler(adagrad) ## changing lr lr <- adagrad$lr ## update count indexKey <- paste0('ik', index) - if (!exists(envir = adagrad, x = indexKey, inherits = FALSE)){ + if (!exists(envir = adagrad, x = indexKey, inherits = FALSE)) { adagrad[[indexKey]] <- 0 } else { indexValue <- adagrad[[indexKey]] @@ -294,25 +354,18 @@ mx.opt.adagrad <- function(learning.rate=0.05, } } - grad <- grad * rescale.grad - if (!is.null(clip_gradient)){ - if(clip_gradient >= 0){ - grad <- mx.nd.clip(grad, -clip_gradient, clip_gradient) - } else { - stop("Error: clip_gradient should be positive number.") - } - } + mx.exec.update.arg.arrays(exec_w, arg.arrays = list(weight = weight,grad = grad), match.name = T) + mx.exec.forward(exec_w, is.train = F) - history <- state - history <- history + (grad * grad) - weight <- weight - lr * (grad / mx.nd.sqrt(history + epsilon) + wd * weight) - state <- history + # update state + mx.exec.update.arg.arrays(exec_w, arg.arrays = list(history = exec_w$ref.outputs$h_output), match.name = T) - return(list(weight=weight, state=state)) + return(exec_w$ref.outputs$w_output) } - return(list(create.state=create.state, update=update)) + return(list(create_exec = create_exec, update = update)) } + #' Create an AdaDelta optimizer with respective parameters. #' #' AdaDelta optimizer as described in Zeiler, M. D. (2012). @@ -325,50 +378,64 @@ mx.opt.adagrad <- function(learning.rate=0.05, #' The constant as described in the thesis. #' @param wd float, default=0.0 #' L2 regularization coefficient add to all the weights. -#' @param rescale.grad float, default=1.0 +#' @param rescale.grad float, default=1 #' rescaling factor of gradient. -#' @param clip_gradient float, optional +#' @param clip_gradient float, default=-1 (no clipping if < 0) #' clip gradient in range [-clip_gradient, clip_gradient]. 
#' -mx.opt.adadelta <- function(rho=0.90, - epsilon=1e-5, - wd=0, - rescale.grad=1, - clip_gradient = NULL) { +mx.opt.adadelta <- function(rho = 0.90, + epsilon = 1e-5, + wd = 0, + rescale.grad = 1, + clip_gradient = -1) { adadelta <- new.env() - create.state <- function(index, weight) { - return (list(acc.g=mx.nd.zeros(dim(weight), ctx(weight)), # accumulated g - acc.delta=mx.nd.zeros(dim(weight), ctx(weight)))) # accumulated delta - } + create_exec <- function(index, weight_dim, ctx) { + weight <- mx.symbol.Variable("weight") + grad <- mx.symbol.Variable("grad") + acc.g <- mx.symbol.Variable("acc.g") + acc.delta <- mx.symbol.Variable("acc.delta") - update <- function(index, weight, grad, state) { - # preprocess grad grad <- grad * rescale.grad - if (!is.null(clip_gradient)){ - if(clip_gradient >= 0){ - grad <- mx.nd.clip(grad, -clip_gradient, clip_gradient) - } else { - stop("Error: clip_gradient should be positive number.") + if (!is.null(clip_gradient)) { + if (clip_gradient >= 0) { + grad <- mx.symbol.clip(data = grad, a.min = -clip_gradient, a.max = clip_gradient) } } - # accumulated g and delta initlization - acc.g <- state$acc.g - acc.delta <- state$acc.delta - - # update g, delta + # update state (acc.g, acc.delta) acc.g <- rho * acc.g + (1 - rho) * (grad * grad) - current.delta <- mx.nd.sqrt(acc.delta + epsilon) / mx.nd.sqrt(acc.g + epsilon) * grad + current.delta <- mx.symbol.sqrt(acc.delta + epsilon) / mx.symbol.sqrt(acc.g + epsilon) * grad acc.delta <- rho * acc.delta + (1 - rho) * (current.delta * current.delta) weight <- weight - current.delta - wd * weight - state <- list(acc.g=acc.g, acc.delta=acc.delta) - return(list(weight=weight, state=state)) + w <- mx.symbol.identity(weight, name = "w") + g <- mx.symbol.identity(acc.g, name = "g") + delta <- mx.symbol.identity(acc.delta, name = "delta") + sym <- mx.symbol.Group(c(w, g, delta)) + + exec <- mx.simple.bind(symbol = sym, weight = weight_dim, ctx = ctx, grad.req = "null") + return(exec) } - return(list(create.state=create.state, update=update)) + + update <- function(index, exec_w, weight, grad) { + + mx.exec.update.arg.arrays(exec_w, arg.arrays = list(weight = weight,grad = grad), match.name = T) + mx.exec.forward(exec_w, is.train = F) + + # update state + mx.exec.update.arg.arrays(exec_w, + arg.arrays = list( + acc.g = exec_w$ref.outputs$g_output, + acc.delta = exec_w$ref.outputs$delta_output), + match.name = T) + + return(exec_w$ref.outputs$w_output) + } + return(list(create_exec = create_exec, update = update)) } + #' Create an optimizer by name and parameters #' #' @param name The name of the optimizer @@ -392,31 +459,28 @@ mx.opt.create <- function(name, ...) 
{ #' @param weights The weights to be optimized #' #' @export -mx.opt.get.updater <- function(optimizer, weights) { - # This is the list to keep track of internal states of optimzer - state.list <- lapply(seq_along(weights), function(i) { - if (is.null(weights[[i]])) return(NULL) - optimizer$create.state(i, weights[[i]]) +mx.opt.get.updater <- function(optimizer, weights, ctx) { + + exec_list <- lapply(seq_along(weights), function(i) { + if (is.null(weights[[i]])) { + return(NULL) + } else { + optimizer$create_exec(index = i, weight_dim = dim(weights[[i]]), ctx = ctx) + } }) + update <- optimizer$update update.closure <- function(weight, grad) { - ulist <- lapply(seq_along(weight), function(i) { + + weight_list <- lapply(seq_along(weight), function(i) { if (!is.null(grad[[i]])) { - update(i, weight[[i]], grad[[i]], state.list[[i]]) + return(update(i, exec_list[[i]], weight[[i]], grad[[i]])) } else { return(NULL) } }) - # update state list, use mutate assignment - state.list <<- lapply(ulist, function(x) { - x$state - }) - # return updated weight list - weight.list <- lapply(ulist, function(x) { - x$weight - }) - return(weight.list) + return(weight_list) } return(update.closure) } diff --git a/R-package/tests/testthat/test_optimizer.R b/R-package/tests/testthat/test_optimizer.R new file mode 100644 index 000000000..c6dacaa72 --- /dev/null +++ b/R-package/tests/testthat/test_optimizer.R @@ -0,0 +1,204 @@ +context("optimizer") + +test_that("sgd", { + + data = mx.symbol.Variable('data') + label = mx.symbol.Variable('label') + fc_weight = mx.symbol.Variable('fc_weight') + fc = mx.symbol.FullyConnected(data = data, weight = fc_weight, no.bias = T, name = 'fc1', num_hidden = 1) + loss = mx.symbol.LinearRegressionOutput(data = fc, label = label, name = 'loss') + + x <- mx.nd.array(array(1:6, dim=2:3)) + y <- mx.nd.array(c(5, 11, 16)) + w1 <- mx.nd.array(array(c(1.1, 1.8), dim = c(2,1))) + + exec <- mxnet:::mx.symbol.bind(symbol = loss, + ctx = mx.cpu(), + arg.arrays = list(data = x, + fc1_weight = w1, + label = y), + aux.arrays = NULL, + grad.reqs = c("null", "write", "null")) + + optimizer <- mx.opt.create("sgd", + learning.rate = 1, + momentum = 0, + wd = 0, + rescale.grad = 1, + clip_gradient = -1) + + updaters <- mx.opt.get.updater(optimizer, exec$ref.arg.arrays, ctx = mx.cpu()) + + mx.exec.forward(exec, is.train = T) + mx.exec.backward(exec) + + arg.blocks <- updaters(exec$ref.arg.arrays, exec$ref.grad.arrays) + mx.exec.update.arg.arrays(exec, arg.blocks, skip.null = TRUE) + + expect_equal(as.array(arg.blocks[[2]]), array(c(1.4, 2.6), dim = c(2,1)), tolerance = 1e-1) + +}) + + +test_that("rmsprop", { + + data = mx.symbol.Variable('data') + label = mx.symbol.Variable('label') + fc_weight = mx.symbol.Variable('fc_weight') + fc = mx.symbol.FullyConnected(data = data, weight = fc_weight, no.bias = T, name = 'fc1', num_hidden = 1) + loss = mx.symbol.LinearRegressionOutput(data = fc, label = label, name = 'loss') + + x <- mx.nd.array(array(1:6, dim=2:3)) + y <- mx.nd.array(c(5, 11, 16)) + w1 <- mx.nd.array(array(c(1.1, 1.8), dim = c(2,1))) + + exec <- mxnet:::mx.symbol.bind(symbol = loss, + ctx = mx.cpu(), + arg.arrays = list(data = x, + fc1_weight = w1, + label = y), + aux.arrays = NULL, + grad.reqs = c("null", "write", "null")) + + optimizer <- mx.opt.create("rmsprop", learning.rate = 1, + centered = TRUE, + gamma1 = 0.95, + gamma2 = 0.9, + epsilon = 1e-4, + wd = 0, + rescale.grad = 1, + clip_gradient = -1) + + updaters <- mx.opt.get.updater(optimizer, exec$ref.arg.arrays, ctx = mx.cpu()) + + 
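+  # run one forward/backward pass, then apply a single optimizer update to the weights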
mx.exec.forward(exec, is.train = T) + mx.exec.backward(exec) + + arg.blocks <- updaters(exec$ref.arg.arrays, exec$ref.grad.arrays) + mx.exec.update.arg.arrays(exec, arg.blocks, skip.null = TRUE) + + expect_equal(as.array(arg.blocks[[2]]), array(c(5.64, 6.38), dim = c(2,1)), tolerance = 1e-1) + +}) + + +test_that("adam", { + + data = mx.symbol.Variable('data') + label = mx.symbol.Variable('label') + fc_weight = mx.symbol.Variable('fc_weight') + fc = mx.symbol.FullyConnected(data = data, weight = fc_weight, no.bias = T, name = 'fc1', num_hidden = 1) + loss = mx.symbol.LinearRegressionOutput(data = fc, label = label, name = 'loss') + + x <- mx.nd.array(array(1:6, dim=2:3)) + y <- mx.nd.array(c(5, 11, 16)) + w1 <- mx.nd.array(array(c(1.1, 1.8), dim = c(2,1))) + + exec <- mxnet:::mx.symbol.bind(symbol = loss, + ctx = mx.cpu(), + arg.arrays = list(data = x, + fc1_weight = w1, + label = y), + aux.arrays = NULL, + grad.reqs = c("null", "write", "null")) + + optimizer <- mx.opt.create("adam", + learning.rate = 1, + beta1 = 0.9, + beta2 = 0.999, + epsilon = 1e-8, + wd = 0, + rescale.grad = 1, + clip_gradient = -1) + + updaters <- mx.opt.get.updater(optimizer, exec$ref.arg.arrays, ctx = mx.cpu()) + + mx.exec.forward(exec, is.train = T) + mx.exec.backward(exec) + + arg.blocks <- updaters(exec$ref.arg.arrays, exec$ref.grad.arrays) + mx.exec.update.arg.arrays(exec, arg.blocks, skip.null = TRUE) + + expect_equal(as.array(arg.blocks[[2]]), array(c(4.26, 4.96), dim = c(2,1)), tolerance = 1e-1) + +}) + + +test_that("adagrad", { + + data = mx.symbol.Variable('data') + label = mx.symbol.Variable('label') + fc_weight = mx.symbol.Variable('fc_weight') + fc = mx.symbol.FullyConnected(data = data, weight = fc_weight, no.bias = T, name = 'fc1', num_hidden = 1) + loss = mx.symbol.LinearRegressionOutput(data = fc, label = label, name = 'loss') + + x <- mx.nd.array(array(1:6, dim=2:3)) + y <- mx.nd.array(c(5, 11, 16)) + w1 <- mx.nd.array(array(c(1.1, 1.8), dim = c(2,1))) + + exec <- mxnet:::mx.symbol.bind(symbol = loss, + ctx = mx.cpu(), + arg.arrays = list(data = x, + fc1_weight = w1, + label = y), + aux.arrays = NULL, + grad.reqs = c("null", "write", "null")) + + optimizer <- mx.opt.create("adagrad", + learning.rate = 1, + epsilon = 1e-8, + wd = 0, + rescale.grad = 1, + clip_gradient = -1) + + updaters <- mx.opt.get.updater(optimizer, exec$ref.arg.arrays, ctx = mx.cpu()) + + mx.exec.forward(exec, is.train = T) + mx.exec.backward(exec) + + arg.blocks <- updaters(exec$ref.arg.arrays, exec$ref.grad.arrays) + mx.exec.update.arg.arrays(exec, arg.blocks, skip.null = TRUE) + + expect_equal(as.array(arg.blocks[[2]]), array(c(2.1, 2.8), dim = c(2,1)), tolerance = 1e-1) + +}) + + +test_that("adadelta", { + + data = mx.symbol.Variable('data') + label = mx.symbol.Variable('label') + fc_weight = mx.symbol.Variable('fc_weight') + fc = mx.symbol.FullyConnected(data = data, weight = fc_weight, no.bias = T, name = 'fc1', num_hidden = 1) + loss = mx.symbol.LinearRegressionOutput(data = fc, label = label, name = 'loss') + + x <- mx.nd.array(array(1:6, dim=2:3)) + y <- mx.nd.array(c(5, 11, 16)) + w1 <- mx.nd.array(array(c(1.1, 1.8), dim = c(2,1))) + + exec <- mxnet:::mx.symbol.bind(symbol = loss, + ctx = mx.cpu(), + arg.arrays = list(data = x, + fc1_weight = w1, + label = y), + aux.arrays = NULL, + grad.reqs = c("null", "write", "null")) + + optimizer <- mx.opt.create("adadelta", + rho = 0.90, + epsilon = 1e-5, + wd = 0, + rescale.grad = 1, + clip_gradient = -1) + + updaters <- mx.opt.get.updater(optimizer, exec$ref.arg.arrays, ctx = 
mx.cpu())
+
+  mx.exec.forward(exec, is.train = T)
+  mx.exec.backward(exec)
+
+  arg.blocks <- updaters(exec$ref.arg.arrays, exec$ref.grad.arrays)
+  mx.exec.update.arg.arrays(exec, arg.blocks, skip.null = TRUE)
+
+  expect_equal(as.array(arg.blocks[[2]]), array(c(1.11, 1.81), dim = c(2,1)), tolerance = 1e-1)
+
+})

From 832a5fbab9ca6e2f13a19abb75b20b845fc86f6e Mon Sep 17 00:00:00 2001
From: Sergey Sokolov
Date: Wed, 25 Jul 2018 15:52:38 -0700
Subject: [PATCH 12/63] Add logistic regression tutorial (#11651)

* Add logistic regression tutorial
* Code review fix
* Add F1 metric, fix code review comments
* Add Download buttons script
---
 .../gluon/logistic_regression_explained.md    | 238 ++++++++++++++++++
 docs/tutorials/index.md                       |   1 +
 tests/tutorials/test_tutorials.py             |   3 +
 3 files changed, 242 insertions(+)
 create mode 100644 docs/tutorials/gluon/logistic_regression_explained.md

diff --git a/docs/tutorials/gluon/logistic_regression_explained.md b/docs/tutorials/gluon/logistic_regression_explained.md
new file mode 100644
index 000000000..8e5e4a547
--- /dev/null
+++ b/docs/tutorials/gluon/logistic_regression_explained.md
@@ -0,0 +1,238 @@
+
+# Logistic regression using Gluon API explained
+
+Logistic regression is one of the first models that newcomers to deep learning implement. The focus of this tutorial is to show how to do logistic regression using the Gluon API.
+
+Before anything else, let's import the packages required for this tutorial.
+
+
+```python
+import numpy as np
+import mxnet as mx
+from mxnet import nd, autograd, gluon
+from mxnet.gluon import nn, Trainer
+from mxnet.gluon.data import DataLoader, ArrayDataset
+
+mx.random.seed(12345) # Added for reproducibility
+```
+
+In this tutorial we will use a synthetic dataset, which contains 10 features drawn from a normal distribution with mean 0 and standard deviation 1, and a class label, which can be either 0 or 1. The size of the dataset is an arbitrary value. The function below helps us generate the dataset. The class label `y` is generated via deterministic logic, so the network has a pattern to look for. The boundary of 3 is selected to make sure that the number of positive examples is smaller than the number of negative ones, but not too small.
+
+
+```python
+def get_random_data(size, ctx):
+    x = nd.normal(0, 1, shape=(size, 10), ctx=ctx)
+    y = x.sum(axis=1) > 3
+    return x, y
+```
+
+Also, let's define a set of hyperparameters that we are going to use later. Since our model is simple and the dataset is small, we are going to use the CPU for calculations. Feel free to change it to GPU for a more advanced scenario.
+
+
+```python
+ctx = mx.cpu()
+train_data_size = 1000
+val_data_size = 100
+batch_size = 10
+```
+
+## Working with data
+
+To work with data, Apache MXNet provides [Dataset](https://mxnet.incubator.apache.org/api/python/gluon/data.html#mxnet.gluon.data.Dataset) and [DataLoader](https://mxnet.incubator.apache.org/api/python/gluon/data.html#mxnet.gluon.data.DataLoader) classes. The former provides indexed access to the data, and the latter is used to shuffle and batchify the data. To learn more about working with data in Gluon, please refer to the [Gluon Datasets and Dataloaders](https://mxnet.incubator.apache.org/tutorials/gluon/datasets.html) tutorial.
+
+Below we define the training and validation datasets, which we are going to use in the tutorial.
+
+
+```python
+train_x, train_ground_truth_class = get_random_data(train_data_size, ctx)
+train_dataset = ArrayDataset(train_x, train_ground_truth_class)
+train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
+
+val_x, val_ground_truth_class = get_random_data(val_data_size, ctx)
+val_dataset = ArrayDataset(val_x, val_ground_truth_class)
+val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True)
+```
+
+## Defining and training the model
+
+The only requirement for logistic regression is that the last layer of the network must be a single neuron. Apache MXNet allows us to do so by using a [Dense](https://mxnet.incubator.apache.org/api/python/gluon/nn.html#mxnet.gluon.nn.Dense) layer and setting the number of units to 1. The rest of the network can be arbitrarily complex.
+
+Below, we define a model which has an input layer of 10 neurons, a couple of inner layers of 10 neurons each, and an output layer of 1 neuron. We stack the layers using a [HybridSequential](https://mxnet.incubator.apache.org/api/python/gluon/gluon.html#mxnet.gluon.nn.HybridSequential) block and initialize the parameters of the network using [Xavier](https://mxnet.incubator.apache.org/api/python/optimization/optimization.html#mxnet.initializer.Xavier) initialization.
+
+
+```python
+net = nn.HybridSequential()
+
+with net.name_scope():
+    net.add(nn.Dense(units=10, activation='relu'))  # input layer
+    net.add(nn.Dense(units=10, activation='relu'))  # inner layer 1
+    net.add(nn.Dense(units=10, activation='relu'))  # inner layer 2
+    net.add(nn.Dense(units=1))                      # output layer: notice, it must have only 1 neuron
+
+net.initialize(mx.init.Xavier())
+```
+
+After defining the model, we need to define a few more things: our loss, our trainer and our metric.
+
+The loss function is used to calculate how much the output of the network differs from the ground truth. Because the classes in logistic regression are either 0 or 1, we use [SigmoidBinaryCrossEntropyLoss](https://mxnet.incubator.apache.org/api/python/gluon/loss.html#mxnet.gluon.loss.SigmoidBinaryCrossEntropyLoss). Notice that we do not set the `from_sigmoid` attribute in the code, which means that the output of the neuron doesn't need to go through sigmoid during training, but at inference time we have to pass it through sigmoid ourselves. You can learn more about cross entropy on [wikipedia](https://en.wikipedia.org/wiki/Cross_entropy).
+
+The Trainer object allows us to specify the method of training to be used. For this tutorial we use [Stochastic Gradient Descent (SGD)](https://mxnet.incubator.apache.org/api/python/optimization/optimization.html#mxnet.optimizer.SGD). For more information on SGD refer to [the following tutorial](https://gluon.mxnet.io/chapter06_optimization/gd-sgd-scratch.html). We also need to parametrize it with a learning rate value, which controls the size of the weight updates, and weight decay, which is used for regularization.
+
+A metric helps us estimate how good our model is in terms of the problem we are trying to solve. While the loss function matters most for the training process, a metric is usually the quantity we actually want to improve and push toward its maximum value. We can also use more than one metric, to measure various aspects of our model. In our example, we are using [Accuracy](https://mxnet.incubator.apache.org/api/python/model.html#mxnet.metric.Accuracy) and [F1 score](https://mxnet.incubator.apache.org/api/python/model.html#mxnet.metric.F1) as measurements of the success of our model.
+
+Below we define these objects.
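+For reference, here is a minimal plain-Python sketch of the quantity this loss computes for a single example, where `z` is the raw network output and `y` is the 0/1 label (the library itself uses a numerically stable formulation internally):
+
+```python
+import math
+
+def sigmoid_bce(z, y):
+    p = 1.0 / (1.0 + math.exp(-z))  # probability of class 1
+    # binary cross entropy: -(y * log(p) + (1 - y) * log(1 - p))
+    return -(y * math.log(p) + (1 - y) * math.log(1 - p))
+```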
+
+
+```python
+loss = gluon.loss.SigmoidBinaryCrossEntropyLoss()
+trainer = Trainer(params=net.collect_params(), optimizer='sgd',
+                  optimizer_params={'learning_rate': 0.1})
+accuracy = mx.metric.Accuracy()
+f1 = mx.metric.F1()
+```
+
+The next step is to define the training function, in which we iterate over all batches of training data, execute the forward pass on each batch and calculate the training loss. On line 19, we sum the losses of every batch per epoch into a single variable, because we calculate the loss per single batch, but want to display it per epoch.
+
+
+```python
+def train_model():
+    cumulative_train_loss = 0
+
+    for i, (data, label) in enumerate(train_dataloader):
+        with autograd.record():
+            # Do forward pass on a batch of training data
+            output = net(data)
+
+            # Calculate loss for the training data batch
+            loss_result = loss(output, label)
+
+        # Calculate gradients
+        loss_result.backward()
+
+        # Update parameters of the network
+        trainer.step(batch_size)
+
+        # sum losses of every batch
+        cumulative_train_loss += nd.sum(loss_result).asscalar()
+
+    return cumulative_train_loss
+```
+
+## Validating the model
+
+Our validation function is very similar to the training one. The main difference is that we want to calculate the accuracy of the model. We use the [Accuracy metric](https://mxnet.incubator.apache.org/api/python/model.html#mxnet.metric.Accuracy) to do so.
+
+The `Accuracy` metric requires 2 arguments: 1) a vector of ground-truth classes and 2) a vector or matrix of predictions. When the predictions are of the same shape as the vector of ground-truth classes, the `Accuracy` class assumes that the prediction vector contains predicted classes. So, it converts the vector to `Int32` and compares each item to the corresponding ground-truth class.
+
+Because of the behaviour above, you will get an unexpected result if you just apply the [Sigmoid](https://mxnet.incubator.apache.org/api/python/ndarray/ndarray.html#mxnet.ndarray.sigmoid) function to the network output and pass it to the `Accuracy` metric. As mentioned before, we need to apply the `Sigmoid` function to the output of the neuron to get the probability of belonging to class 1. But `Sigmoid` produces output in the range [0, 1], and every value below 1 would be cast to 0, even one as high as 0.99. To avoid this we write a custom bit of code on line 12, that:
+
+1. Calculates the sigmoid using the `Sigmoid` function
+
+2. Subtracts a threshold from the original sigmoid output. Usually, the threshold is equal to 0.5, but it can be higher, if you want to require more certainty before assigning an item to class 1.
+
+3. Uses the [mx.nd.ceil](https://mxnet.incubator.apache.org/api/python/ndarray/ndarray.html#mxnet.ndarray.ceil) function, which converts all negative values to 0 and all positive values to 1
+
+After these transformations we can pass the result to the `Accuracy.update()` method and expect it to behave properly.
+
+For the `F1` metric to work, instead of one number per class, we must pass the probabilities of belonging to both classes. Because of that, on lines 21-22 we:
+
+1. Reshape the predictions to a single vector
+
+2. Stack together two vectors: the probabilities of belonging to class 0 (1 - `prediction`) and the probabilities of belonging to class 1.
+
+Then we pass this stacked matrix to the `F1` score.
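+As a quick standalone illustration of the sigmoid-threshold-ceil conversion described above (a sketch with made-up probability values and a threshold of 0.5):
+
+```python
+p = mx.nd.array([0.2, 0.5, 0.99])  # hypothetical sigmoid outputs
+classes = mx.nd.ceil(p - 0.5)      # 0.2 -> 0, 0.5 -> 0, 0.99 -> 1
+```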
+
+
+```python
+def validate_model(threshold):
+    cumulative_val_loss = 0
+
+    for i, (val_data, val_ground_truth_class) in enumerate(val_dataloader):
+        # Do forward pass on a batch of validation data
+        output = net(val_data)
+
+        # Similar to cumulative training loss, calculate cumulative validation loss
+        cumulative_val_loss += nd.sum(loss(output, val_ground_truth_class)).asscalar()
+
+        # getting prediction as a sigmoid
+        prediction = net(val_data).sigmoid()
+
+        # Converting neuron outputs to classes
+        predicted_classes = mx.nd.ceil(prediction - threshold)
+
+        # Update validation accuracy
+        accuracy.update(val_ground_truth_class, predicted_classes.reshape(-1))
+
+        # calculate probabilities of belonging to different classes. F1 metric works only with this notation
+        prediction = prediction.reshape(-1)
+        probabilities = mx.nd.stack(1 - prediction, prediction, axis=1)
+
+        f1.update(val_ground_truth_class, probabilities)
+
+    return cumulative_val_loss
+```
+
+## Putting it all together
+
+Using the functions defined above, we can finally write our main training loop.
+
+
+```python
+epochs = 10
+threshold = 0.5
+
+for e in range(epochs):
+    avg_train_loss = train_model() / train_data_size
+    avg_val_loss = validate_model(threshold) / val_data_size
+
+    print("Epoch: %s, Training loss: %.2f, Validation loss: %.2f, Validation accuracy: %.2f, F1 score: %.2f" %
+          (e, avg_train_loss, avg_val_loss, accuracy.get()[1], f1.get()[1]))
+
+    # we reset accuracy, so the new epoch's accuracy would be calculated from a blank state
+    accuracy.reset()
+```
+
+    Epoch: 0, Training loss: 0.43, Validation loss: 0.36, Validation accuracy: 0.85, F1 score: 0.00
+
+    Epoch: 1, Training loss: 0.22, Validation loss: 0.14, Validation accuracy: 0.96, F1 score: 0.35
+
+    Epoch: 2, Training loss: 0.09, Validation loss: 0.11, Validation accuracy: 0.97, F1 score: 0.48
+
+    Epoch: 3, Training loss: 0.07, Validation loss: 0.09, Validation accuracy: 0.96, F1 score: 0.53
+
+    Epoch: 4, Training loss: 0.06, Validation loss: 0.09, Validation accuracy: 0.97, F1 score: 0.58
+
+    Epoch: 5, Training loss: 0.04, Validation loss: 0.12, Validation accuracy: 0.97, F1 score: 0.59
+
+    Epoch: 6, Training loss: 0.05, Validation loss: 0.09, Validation accuracy: 0.99, F1 score: 0.62
+
+    Epoch: 7, Training loss: 0.05, Validation loss: 0.10, Validation accuracy: 0.97, F1 score: 0.62
+
+    Epoch: 8, Training loss: 0.05, Validation loss: 0.12, Validation accuracy: 0.95, F1 score: 0.63
+
+    Epoch: 9, Training loss: 0.04, Validation loss: 0.09, Validation accuracy: 0.98, F1 score: 0.65
+
+
+In our case we reach an accuracy of 0.98 and an F1 score of 0.65.
+
+## Tip 1: Use only one neuron in the output layer
+
+Even though there are 2 classes, there should be only one output neuron, because `SigmoidBinaryCrossEntropyLoss` accepts only one feature as input.
+
+## Tip 2: Encode classes as 0 and 1
+
+For `SigmoidBinaryCrossEntropyLoss` to work, the classes must be encoded as 0 and 1. In some datasets the class encoding might be different, like -1 and 1 or 1 and 2. If that is how your dataset looks, then you need to re-encode the data before using `SigmoidBinaryCrossEntropyLoss`.
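+A minimal sketch of such a re-encoding, assuming hypothetical labels stored as -1 and 1:
+
+```python
+y = mx.nd.array([-1, 1, 1, -1])  # hypothetical -1/1 labels
+y = (y + 1) / 2                  # re-encoded to 0/1: [0, 1, 1, 0]
+```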
+ +## Tip 3: Use SigmoidBinaryCrossEntropyLoss instead of LogisticRegressionOutput + +NDArray API has two options to calculate logistic regression loss: [SigmoidBinaryCrossEntropyLoss](https://mxnet.incubator.apache.org/api/python/gluon/loss.html#mxnet.gluon.loss.SigmoidBinaryCrossEntropyLoss) and [LogisticRegressionOutput](https://mxnet.incubator.apache.org/api/python/ndarray/ndarray.html#mxnet.ndarray.LogisticRegressionOutput). `LogisticRegressionOutput` is designed to be an output layer when using the Module API, and is not supposed to be used when using Gluon API. + +## Conclusion + +In this tutorial I explained some potential pitfalls to be aware of. When doing logistic regression using Gluon API remember to: +1. Use only one neuron in the output layer +1. Encode class labels as 0 or 1 +1. Use `SigmoidBinaryCrossEntropyLoss` +1. Convert probabilities to classes before calculating Accuracy + + \ No newline at end of file diff --git a/docs/tutorials/index.md b/docs/tutorials/index.md index bb5f8546f..57bfec7cd 100644 --- a/docs/tutorials/index.md +++ b/docs/tutorials/index.md @@ -34,6 +34,7 @@ Select API:  * Models * [Model Zoo: using pre-trained models](/tutorials/gluon/pretrained_models.html) * [Linear Regression](http://gluon.mxnet.io/chapter02_supervised-learning/linear-regression-gluon.html) External link + * [Logistic Regression](/tutorials/gluon/logistic_regression_explained.html) * [Word-level text generation with RNN, LSTM and GRU](http://gluon.mxnet.io/chapter05_recurrent-neural-networks/rnns-gluon.html) External link * [Visual Question Answering](http://gluon.mxnet.io/chapter08_computer-vision/visual-question-answer.html) External link * Practitioner Guides diff --git a/tests/tutorials/test_tutorials.py b/tests/tutorials/test_tutorials.py index d2d5e6e15..22d00c181 100644 --- a/tests/tutorials/test_tutorials.py +++ b/tests/tutorials/test_tutorials.py @@ -136,6 +136,9 @@ def test_python_matrix_factorization(): def test_python_linear_regression(): assert _test_tutorial_nb('python/linear-regression') +def test_python_logistic_regression() : + assert _test_tutorial_nb('gluon/logistic_regression_explained') + def test_python_mnist(): assert _test_tutorial_nb('python/mnist') From 7cd01ff6d0b171e1bc95e3ac1eeb11a644d0abcd Mon Sep 17 00:00:00 2001 From: Hao Jin Date: Wed, 25 Jul 2018 19:45:50 -0400 Subject: [PATCH 13/63] Re-enabling randomized test_operator/test_operator_gpu.test_dot (#11888) --- tests/python/unittest/test_operator.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index 11180ebbc..e50f8a143 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -2504,8 +2504,9 @@ def test_stn_valid_sampling(): ) + target_shape)) -# Seed set because the test is not robust enough to operate on random data -@with_seed(1234) +# @haojin2: Getting rid of fixed seed as flakiness could not be reproduced, +# tracked at https://github.com/apache/incubator-mxnet/issues/11714 +@with_seed() def test_dot(): ctx=default_context() dtypes = ['float32', 'float64'] From 302aae33bf2c60187b869cc643b1489e5739e974 Mon Sep 17 00:00:00 2001 From: Hao Jin Date: Wed, 25 Jul 2018 21:05:51 -0400 Subject: [PATCH 14/63] Fix non-determinism of dot(csr.T, dns) = dns with tests (#11825) * fix undeterminism of dot(csr.T, dns) = dns with tests * address code reviews --- src/common/utils.h | 12 + src/operator/tensor/dot-inl.cuh | 314 +++++++----------- 
src/operator/tensor/indexing_op.cu | 2 +- src/operator/tensor/indexing_op.h | 12 +- src/storage/pooled_storage_manager.h | 10 +- tests/python/unittest/test_sparse_operator.py | 9 +- 6 files changed, 139 insertions(+), 220 deletions(-) diff --git a/src/common/utils.h b/src/common/utils.h index d7ed4ddf0..96949a047 100644 --- a/src/common/utils.h +++ b/src/common/utils.h @@ -663,6 +663,18 @@ constexpr size_t MaxIntegerValue() { return size_t(2) << 10; } +MSHADOW_XINLINE int ilog2ul(size_t a) { + int k = 1; + while (a >>= 1) ++k; + return k; +} + +MSHADOW_XINLINE int ilog2ui(unsigned int a) { + int k = 1; + while (a >>= 1) ++k; + return k; +} + } // namespace common } // namespace mxnet #endif // MXNET_COMMON_UTILS_H_ diff --git a/src/operator/tensor/dot-inl.cuh b/src/operator/tensor/dot-inl.cuh index 8aedec066..6c94ad1ec 100644 --- a/src/operator/tensor/dot-inl.cuh +++ b/src/operator/tensor/dot-inl.cuh @@ -31,6 +31,7 @@ #include "./sort_op.h" #include "./util/tensor_util-inl.h" #include "./util/tensor_util-inl.cuh" +#include "../../common/utils.h" namespace mxnet { namespace op { @@ -176,119 +177,6 @@ struct DotCsrTransDnsDnsScalarKernel { } }; -/*! - * \brief GPU warp kernel of dot(csr.T, dns1) = dns2 - * Parallelization by columns: 1 warp computes one lhs column for one rhs column - */ -struct DotCsrTransDnsDnsWarpKernel { - /*! - * \brief see DotCsrTransDnsDnsScalarKernel Map for documentation. - */ - template - __device__ __forceinline__ static void Map(int tid, - DType* out, - const DType* data_l, - const IType* indptr_l, - const CType* col_idx_l, - const DType* data_r, - const nnvm::dim_t num_cols_r) { - using nnvm::dim_t; - const dim_t warp_id = tid / 32; // global warp id - const dim_t lane = tid & (32-1); // local thread id within warp - const dim_t icol = warp_id / num_cols_r; // lhs column that this warp computes - const dim_t kcol = warp_id % num_cols_r; // rhs column that this warp computes - - // Compute range of nnz elements in this column - const dim_t low = static_cast(indptr_l[icol]); - const dim_t high = static_cast(indptr_l[icol+1]); - - // Iterate through the nnz elements in this column - for (dim_t j = low+lane; j < high; j+=32) { - const dim_t irow = static_cast(col_idx_l[j]); - const DType val = data_l[j]*data_r[icol*num_cols_r+kcol]; - atomicAdd(static_cast(&(out[irow*num_cols_r+kcol])), val); - } - } -}; - -/*! - * \brief GPU thread block kernel of dot(csr.T, dns1) = dns2 - * Parallelization by columns: 1 thread block computes one lhs column for all rhs columns - */ -struct DotCsrTransDnsDnsThreadBlockKernel { - /*! - * \brief see DotCsrTransDnsDnsScalarKernel Map for documentation. 
- */ - template - __device__ __forceinline__ static void Map(int tid, - DType* out, - const DType* data_l, - const IType* indptr_l, - const CType* col_idx_l, - const DType* data_r, - const nnvm::dim_t num_cols_r) { - using nnvm::dim_t; - const dim_t warps_per_block = blockDim.x / 32; // number of warps in this thread block - const dim_t warp_id = tid / 32; // global warp id - const dim_t lane = tid & (32-1); // local thread id within warp - const dim_t icol = blockIdx.x; // lhs column that this thread block computes - const dim_t kcol = warp_id % warps_per_block; // rhs column where warp starts computing (offset) - - // Compute range of nnz elements in this lhs column - const dim_t low = static_cast(indptr_l[icol]); - const dim_t high = static_cast(indptr_l[icol+1]); - - // Iterate through the nnz elements in this lhs column - for (dim_t j = low+lane; j < high; j+=32) { - const dim_t irow = static_cast(col_idx_l[j]); - const DType datum_l = data_l[j]; - // Iterate over rhs columns that this warp computes - for (dim_t k = kcol; k < num_cols_r; k+=warps_per_block) { - const DType val = datum_l*data_r[icol*num_cols_r+k]; - atomicAdd(static_cast(&(out[irow*num_cols_r+k])), val); - } - } - } -}; - -/*! - * \brief GPU warp block kernel of dot(csr.T, dns1) = dns2 - * Parallelization by columns: 1 warp computes one lhs column for all rhs columns - */ -struct DotCsrTransDnsDnsWarpBlockKernel { - /*! - * \brief see DotCsrTransDnsDnsScalarKernel Map for documentation. - */ - template - __device__ __forceinline__ static void Map(int tid, - DType* out, - const DType* data_l, - const IType* indptr_l, - const CType* col_idx_l, - const DType* data_r, - const nnvm::dim_t num_cols_r) { - using nnvm::dim_t; - const dim_t warp_id = tid / 32; // global warp id - const dim_t lane = tid & (32-1); // local thread id within warp - const dim_t icol = warp_id; // lhs column that this warp computes - - // Compute range of nnz elements in this column - const dim_t low = static_cast(indptr_l[icol]); - const dim_t high = static_cast(indptr_l[icol+1]); - - // Iterate through the nnz elements in lhs column - for (dim_t j = low+lane; j < high; j+=32) { - const dim_t irow = static_cast(col_idx_l[j]); - const DType datum_l = data_l[j]; - // Iterate over all rhs columns - for (dim_t k = 0; k < num_cols_r; k++) { - const DType val = datum_l*data_r[icol*num_cols_r+k]; - atomicAdd(static_cast(&(out[irow*num_cols_r+k])), val); - } - } - } -}; - /*! 
* \brief GPU Kernel of dot(csr.T, rsp1) = rsp2 * Parallelization by rows: 1 thread/row @@ -510,7 +398,7 @@ inline void DotCsrDnsDnsImpl(const OpContext& ctx, return; } - using mshadow::cuda::kBaseThreadNum; + using namespace mshadow; using mxnet_op::Kernel; using mxnet_op::set_zero; using nnvm::dim_t; @@ -518,7 +406,6 @@ inline void DotCsrDnsDnsImpl(const OpContext& ctx, const dim_t num_rows_l = lhs.shape()[0]; const dim_t num_cols_r = rhs.shape_[1]; const dim_t threads_per_warp = mxnet_op::cuda_get_device_prop().warpSize; - const dim_t threads_per_block = kBaseThreadNum; dim_t num_threads; // TODO: remove kernel dependency on warpSize=32 if (threads_per_warp != 32) { @@ -539,86 +426,120 @@ inline void DotCsrDnsDnsImpl(const OpContext& ctx, Kernel::Launch(s, num_threads, data_out.dptr()); } if (trans_lhs) { - // Different kernel versions are optimized for different matrix instances - // TODO: switch between kernel versions depending on input - // (1) 'Scalar kernel' (one thread computing one output element ) - // (2) 'Warp kernel' (one warp computing one lhs column for one rhs column ) - // (3) 'Thread block kernel' (one thread block computing one lhs column for all rhs columns) - // (4) 'Warp block kernel' (one warp computing one lhs column for all rhs columns) - const int kernel_version = 0; - switch (kernel_version) { - case 1: - num_threads = data_out.Size(); - MXNET_ASSIGN_REQ_SWITCH(req, ReqType, { - Kernel, gpu>::Launch(s, num_threads, - data_out.dptr(), data_l.dptr(), indptr_l.dptr(), - col_idx_l.dptr(), data_r.dptr(), num_rows_l, num_cols_r); - }); - break; - case 2: - num_threads = threads_per_warp * num_rows_l * num_cols_r; - Kernel::Launch(s, num_threads, - data_out.dptr(), data_l.dptr(), indptr_l.dptr(), - col_idx_l.dptr(), data_r.dptr(), num_cols_r); - break; - case 3: - num_threads = threads_per_block * num_rows_l; - Kernel::Launch(s, num_threads, - data_out.dptr(), data_l.dptr(), indptr_l.dptr(), - col_idx_l.dptr(), data_r.dptr(), num_cols_r); - break; - case 4: - num_threads = threads_per_warp * num_rows_l; - Kernel::Launch(s, num_threads, + // TODO(haojin2): Switching to deterministic algorithm for now. + // Further optimizations to come later. 
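+      // The CSC arrays built below form an explicit transpose of the CSR lhs:
+      // column indices are sorted together with their original positions, the
+      // sorted columns are histogrammed and prefix-scanned into csc_indptr, and
+      // the data and row indices are scattered into CSC order. The product can
+      // then be computed with the non-transposed kernels and no atomic adds,
+      // which is what makes this path deterministic.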
+ const nnvm::dim_t num_csr_rows = lhs.shape()[0]; + const nnvm::dim_t num_csr_cols = lhs.shape()[1]; + const nnvm::dim_t num_dns_rows = rhs.shape_[0]; + const nnvm::dim_t nnz = lhs.storage_shape().Size(); + + IType* original_idx_ptr = nullptr; + IType* csc_indices_ptr = nullptr; + IType* csc_cols_ptr = nullptr; + CType* csr_rows_ptr = nullptr; + CType* csc_indptr_ptr = nullptr; + DType* csc_data_ptr = nullptr; + char* temp_storage_ptr = nullptr; + size_t original_idx_bytes = nnz*sizeof(IType); + size_t csc_indices_bytes = nnz*sizeof(IType); + size_t csc_cols_bytes = nnz*sizeof(IType); + size_t csr_rows_bytes = nnz*sizeof(CType); + size_t csc_indptr_bytes = (num_csr_cols+1)*sizeof(CType); + size_t csc_data_bytes = nnz*sizeof(DType); + size_t scan_temp_storage_bytes = 0; + size_t temp_storage_bytes = SortByKeyWorkspaceSize(nnz); + IType* csr_indices_ptr = col_idx_l.dptr(); + cub::DeviceScan::ExclusiveSum(temp_storage_ptr, + scan_temp_storage_bytes, + csc_indptr_ptr, + csc_indptr_ptr, + num_csr_cols+1, + mshadow::Stream::GetStream(s)); + temp_storage_bytes = std::max(temp_storage_bytes, scan_temp_storage_bytes); + temp_storage_bytes += (sizeof(dim_t) - temp_storage_bytes % sizeof(dim_t)); + size_t total_workspace_bytes = + original_idx_bytes + csc_indices_bytes + csc_cols_bytes + csr_rows_bytes + + csc_indptr_bytes + csc_data_bytes + temp_storage_bytes; + total_workspace_bytes += (sizeof(IType) - total_workspace_bytes % sizeof(IType)); + Tensor workspace = ctx.requested[0] + .get_space_typed(Shape1(total_workspace_bytes), s); + original_idx_ptr = reinterpret_cast(workspace.dptr_); + csc_indices_ptr = reinterpret_cast(workspace.dptr_ + original_idx_bytes); + csc_cols_ptr = reinterpret_cast(workspace.dptr_ + original_idx_bytes + + csc_indices_bytes); + csr_rows_ptr = reinterpret_cast(workspace.dptr_ + original_idx_bytes + + csc_indices_bytes + csc_cols_bytes); + csc_indptr_ptr = reinterpret_cast(workspace.dptr_ + original_idx_bytes + + csc_indices_bytes + csc_cols_bytes + + csr_rows_bytes); + temp_storage_ptr = workspace.dptr_ + original_idx_bytes + csc_indices_bytes + + csc_cols_bytes + csr_rows_bytes + csc_indptr_bytes; + csc_data_ptr = reinterpret_cast( + workspace.dptr_ + total_workspace_bytes - csc_data_bytes); + + // Fill original_idx + mxnet_op::Kernel::Launch( + s, nnz, 1, IType(0), IType(1), kWriteTo, original_idx_ptr); + // Fill csc_cols with copy of csr_indices + mxnet_op::Kernel, gpu>::Launch( + s, nnz, csc_cols_ptr, csr_indices_ptr); + // Allocate the tensors needed for SortByKey + Tensor original_idx(original_idx_ptr, Shape1(nnz), s); + Tensor csc_cols(csc_cols_ptr, Shape1(nnz), s); + Tensor temp_storage(temp_storage_ptr, Shape1(temp_storage_bytes), s); + + int num_bits = common::ilog2ul(num_csr_cols - 1); + SortByKey(csc_cols, original_idx, true, &temp_storage, 0, num_bits); + + // Scatter csr indptr to row id + mxnet_op::Kernel::Launch( + s, num_csr_rows, indptr_l.dptr(), csr_rows_ptr, num_csr_rows); + // Reset indptr to zero + mxnet_op::Kernel::Launch(s, num_csr_cols+1, csc_indptr_ptr); + // Histogram on the sorted cols + mxnet_op::Kernel::Launch( + s, nnz, csc_indptr_ptr, csc_cols_ptr, nnz); + // Scan the bin counts for every column to get csc_indptr + cub::DeviceScan::ExclusiveSum(temp_storage_ptr, + temp_storage_bytes, + csc_indptr_ptr, + csc_indptr_ptr, + num_csr_cols+1, + mshadow::Stream::GetStream(s)); + // Assign data to csc matrix arrays + mxnet_op::Kernel::Launch( + s, nnz, original_idx_ptr, data_l.dptr(), csr_rows_ptr, csc_data_ptr, + csc_indices_ptr, nnz); + if 
(num_cols_r > 4) { + num_threads = data_out.Size(); + MXNET_ASSIGN_REQ_SWITCH(req, ReqType, { + Kernel, gpu>::Launch(s, num_threads, + data_out.dptr(), csc_data_ptr, csc_indptr_ptr, + csc_indices_ptr, data_r.dptr(), num_cols_r); + }); + } else { + num_threads = threads_per_warp * num_rows_l * num_cols_r; + MXNET_ASSIGN_REQ_SWITCH(req, ReqType, { + Kernel, gpu>::Launch(s, num_threads, + data_out.dptr(), csc_data_ptr, csc_indptr_ptr, + csc_indices_ptr, data_r.dptr(), num_cols_r); + }); + } + } else { + if (num_cols_r > 4) { + num_threads = data_out.Size(); + MXNET_ASSIGN_REQ_SWITCH(req, ReqType, { + Kernel, gpu>::Launch(s, num_threads, data_out.dptr(), data_l.dptr(), indptr_l.dptr(), col_idx_l.dptr(), data_r.dptr(), num_cols_r); - break; - default: - num_threads = threads_per_warp * num_rows_l * num_cols_r; - Kernel::Launch(s, num_threads, + }); + } else { + num_threads = threads_per_warp * num_rows_l * num_cols_r; + MXNET_ASSIGN_REQ_SWITCH(req, ReqType, { + Kernel, gpu>::Launch(s, num_threads, data_out.dptr(), data_l.dptr(), indptr_l.dptr(), col_idx_l.dptr(), data_r.dptr(), num_cols_r); - break; - } - } else { - // Different kernel versions are optimized for different matrix instances - // (1) 'Scalar kernel' (one thread computing one output element) - // (2) 'Vector kernel' (one warp computing one output element) - const int kernel_version = 0; - switch (kernel_version) { - case 1: - num_threads = data_out.Size(); - MXNET_ASSIGN_REQ_SWITCH(req, ReqType, { - Kernel, gpu>::Launch(s, num_threads, - data_out.dptr(), data_l.dptr(), indptr_l.dptr(), - col_idx_l.dptr(), data_r.dptr(), num_cols_r); - }); - break; - case 2: - num_threads = threads_per_warp * num_rows_l * num_cols_r; - MXNET_ASSIGN_REQ_SWITCH(req, ReqType, { - Kernel, gpu>::Launch(s, num_threads, - data_out.dptr(), data_l.dptr(), indptr_l.dptr(), - col_idx_l.dptr(), data_r.dptr(), num_cols_r); - }); - break; - default: - if (num_cols_r > 4) { - num_threads = data_out.Size(); - MXNET_ASSIGN_REQ_SWITCH(req, ReqType, { - Kernel, gpu>::Launch(s, num_threads, - data_out.dptr(), data_l.dptr(), indptr_l.dptr(), - col_idx_l.dptr(), data_r.dptr(), num_cols_r); - }); - } else { - num_threads = threads_per_warp * num_rows_l * num_cols_r; - MXNET_ASSIGN_REQ_SWITCH(req, ReqType, { - Kernel, gpu>::Launch(s, num_threads, - data_out.dptr(), data_l.dptr(), indptr_l.dptr(), - col_idx_l.dptr(), data_r.dptr(), num_cols_r); - }); - } - break; + }); } } }); @@ -671,13 +592,6 @@ struct DotCsrTransDnsRspKernel { } }; -// Returns integer log2(a) rounded up -inline int log2i(size_t a) { - int k = 1; - while (a >>= 1) k++; - return k; -} - /*! 
* \brief GPU Impl of dot(csr.T, dns) = rsp */ @@ -777,7 +691,7 @@ inline void DotCsrDnsRspImpl(const OpContext& ctx, Tensor original_idx(original_idx_ptr, Shape1(nnz), s); Tensor temp_storage(temp_storage_ptr, Shape1(total_temp_bytes), s); - int num_bits = log2i(num_cols_l - 1); + int num_bits = common::ilog2ul(num_cols_l - 1); SortByKey(col_idx_copy, original_idx, true, &temp_storage, 0, num_bits); // over-allocate aux indices @@ -1124,7 +1038,7 @@ inline void DotDnsCsrDnsImpl(const OpContext& ctx, const gpu& gpu_dev, Tensor csc_cols(csc_cols_ptr, Shape1(nnz), s); Tensor temp_storage(temp_storage_ptr, Shape1(temp_storage_bytes), s); - int num_bits = log2i(num_csr_cols - 1); + int num_bits = common::ilog2ul(num_csr_cols - 1); SortByKey(csc_cols, original_idx, true, &temp_storage, 0, num_bits); // Scatter csr indptr to row id diff --git a/src/operator/tensor/indexing_op.cu b/src/operator/tensor/indexing_op.cu index b0ee05ea1..39fd81ef2 100644 --- a/src/operator/tensor/indexing_op.cu +++ b/src/operator/tensor/indexing_op.cu @@ -217,7 +217,7 @@ void SparseEmbeddingDeterministicKernelLaunch(const OpContext& ctx, Kernel::Launch(s, data_size, 1, static_cast(0), static_cast(1), kWriteTo, original_idx); // sort data with its original idx - int num_bits = ilog2(num_rows - 1); + int num_bits = common::ilog2ui(num_rows - 1); char* temp_storage_ptr = reinterpret_cast(temp_storage); Tensor temp_storage_tensor(temp_storage_ptr, Shape1(sort_workspace_size), s); diff --git a/src/operator/tensor/indexing_op.h b/src/operator/tensor/indexing_op.h index 07d96dcbd..edaf93973 100644 --- a/src/operator/tensor/indexing_op.h +++ b/src/operator/tensor/indexing_op.h @@ -44,6 +44,7 @@ #include "./sort_op.h" #include "./init_op.h" #include "../../engine/openmp.h" +#include "../../common/utils.h" #ifdef __CUDACC__ #include "./indexing_op-inl.cuh" #endif @@ -528,13 +529,6 @@ void SparseEmbeddingOpForwardEx(const nnvm::NodeAttrs& attrs, } } -// Returns integer log2(a) rounded up -inline int ilog2(unsigned int a) { - int k = 1; - while (a >>= 1) k++; - return k; -} - /*! 
\brief cast to type and clip to range [0, K - 1] */ struct tcast_clip { @@ -906,7 +900,7 @@ void TakeOpBackwardImpl(mshadow::Stream* s, s, idxshape.Size(), sorted_idx_ptr, sorted_idx_ptr, static_cast(arrshape[axis])); } Tensor original_idx(original_idx_ptr, Shape1(idxshape.Size()), s); - int num_bits = ilog2(static_cast(idxshape.Size()) - 1); + int num_bits = common::ilog2ui(static_cast(idxshape.Size()) - 1); Tensor sorted_idx(sorted_idx_ptr, Shape1(idxshape.Size()), s); SortByKey(sorted_idx, original_idx, true, &temp_storage, 0, num_bits); for (size_t i = 0; i < idxshape.Size(); ++i) { @@ -1000,7 +994,7 @@ void TakeOpBackwardImpl(mshadow::Stream* s, } Tensor original_idx(original_idx_ptr, Shape1(idxshape.Size()), s); Tensor temp_storage(temp_storage_ptr, Shape1(temp_storage_bytes), s); - int num_bits = ilog2(static_cast(idxshape.Size()) - 1); + int num_bits = common::ilog2ui(static_cast(idxshape.Size()) - 1); Tensor sorted_idx(sorted_idx_ptr, Shape1(idxshape.Size()), s); SortByKey(sorted_idx, original_idx, true, &temp_storage, 0, num_bits); cub::DeviceScan::ExclusiveSum(temp_storage_ptr, diff --git a/src/storage/pooled_storage_manager.h b/src/storage/pooled_storage_manager.h index bed97301f..f3a9b16cd 100644 --- a/src/storage/pooled_storage_manager.h +++ b/src/storage/pooled_storage_manager.h @@ -38,6 +38,7 @@ #include #include "./storage_manager.h" #include "../common/cuda_utils.h" +#include "../common/utils.h" namespace mxnet { @@ -173,10 +174,10 @@ class GPUPooledRoundedStorageManager final : public StorageManager { LOG(FATAL) << "MXNET_GPU_MEM_POOL_PAGE_SIZE cannot be set to a value smaller than 32. " \ << "Got: " << page_size_ << "."; } - if (page_size_ != 1ul << log2_round_up(page_size_)) { + if (page_size_ != 1ul << common::ilog2ul(page_size_ - 1)) { LOG(FATAL) << "MXNET_GPU_MEM_POOL_PAGE_SIZE must be a power of 2. Got: " << page_size_ << "."; } - page_size_ = log2_round_up(page_size_); + page_size_ = common::ilog2ul(page_size_ - 1); if (cut_off_ < 20 || cut_off_ > LOG2_MAX_MEM) { LOG(FATAL) << "MXNET_GPU_MEM_POOL_ROUND_LINEAR_CUTOFF cannot be set to a value " \ << "smaller than 20 or greater than " << LOG2_MAX_MEM << ". Got: " \ @@ -205,9 +206,6 @@ class GPUPooledRoundedStorageManager final : public StorageManager { } private: - inline int log2_round_up(size_t s) { - return static_cast(std::ceil(std::log2(s))); - } inline int div_pow2_round_up(size_t s, int divisor_log2) { // (1025, 10) -> 2 // (2048, 10) -> 2 @@ -216,7 +214,7 @@ class GPUPooledRoundedStorageManager final : public StorageManager { return static_cast(result + (s > (result << divisor_log2) ? 
1 : 0)); } inline int get_bucket(size_t s) { - int log_size = log2_round_up(s); + int log_size = common::ilog2ul(s - 1); if (log_size > static_cast(cut_off_)) return div_pow2_round_up(s, cut_off_) - 1 + cut_off_; else diff --git a/tests/python/unittest/test_sparse_operator.py b/tests/python/unittest/test_sparse_operator.py index 09b0ae3a9..b4783bd23 100644 --- a/tests/python/unittest/test_sparse_operator.py +++ b/tests/python/unittest/test_sparse_operator.py @@ -1424,7 +1424,7 @@ def test_sparse_dot_zero_output(lhs_shape, trans_lhs, rhs_num_cols): @with_seed() def test_sparse_dot_determinism(): - def test_dot_determinism(lhs_stype, rhs_stype, lhs_density, rhs_density, transpose_a, transpose_b, forward_stype): + def check_dot_determinism(lhs_stype, rhs_stype, lhs_density, rhs_density, transpose_a, transpose_b, forward_stype): lhs_row = rnd.randint(50, 100) lhs_col = rnd.randint(50, 100) if transpose_a: @@ -1444,10 +1444,11 @@ def test_dot_determinism(lhs_stype, rhs_stype, lhs_density, rhs_density, transpo res2 = mx.nd.sparse.dot(lhs, rhs, transpose_a=transpose_a, transpose_b=transpose_b, forward_stype=forward_stype) assert_almost_equal(res1.asnumpy(), res2.asnumpy(), rtol=0.0, atol=0.0) - test_dot_determinism('csr', 'default', 0.1, 1.0, True, False, 'row_sparse') + check_dot_determinism('csr', 'default', 0.1, 1.0, True, False, 'row_sparse') forward_stype = 'csr' if default_context() == mx.cpu() else 'default' - test_dot_determinism('default', 'csr', 1.0, 0.1, False, False, forward_stype) - test_dot_determinism('default', 'csr', 1.0, 0.1, False, True, forward_stype) + check_dot_determinism('default', 'csr', 1.0, 0.1, False, False, forward_stype) + check_dot_determinism('default', 'csr', 1.0, 0.1, False, True, forward_stype) + check_dot_determinism('csr', 'default', 0.1, 1.0, True, False, 'default') @with_seed() From f5b95b090815e879b57dca233604dcb3f1df967a Mon Sep 17 00:00:00 2001 From: Vandana Kannan Date: Wed, 25 Jul 2018 21:24:00 -0700 Subject: [PATCH 15/63] Support integer type in ImageIter (#11864) --- python/mxnet/image/image.py | 13 ++++++---- tests/python/unittest/test_image.py | 39 +++++++++++++++++------------ 2 files changed, 31 insertions(+), 21 deletions(-) diff --git a/python/mxnet/image/image.py b/python/mxnet/image/image.py index 5af2b9556..c2a190664 100644 --- a/python/mxnet/image/image.py +++ b/python/mxnet/image/image.py @@ -1057,6 +1057,8 @@ class ImageIter(io.DataIter): Data name for provided symbols. label_name : str Label name for provided symbols. + dtype : str + Label data type. Default: float32. Other options: int32, int64, float64 kwargs : ... More arguments for creating augmenter. See mx.image.CreateAugmenter. 
""" @@ -1064,9 +1066,10 @@ class ImageIter(io.DataIter): def __init__(self, batch_size, data_shape, label_width=1, path_imgrec=None, path_imglist=None, path_root=None, path_imgidx=None, shuffle=False, part_index=0, num_parts=1, aug_list=None, imglist=None, - data_name='data', label_name='softmax_label', **kwargs): + data_name='data', label_name='softmax_label', dtype='float32', **kwargs): super(ImageIter, self).__init__() assert path_imgrec or path_imglist or (isinstance(imglist, list)) + assert dtype in ['int32', 'float32', 'int64', 'float64'], dtype + ' label not supported' num_threads = os.environ.get('MXNET_CPU_WORKER_NTHREADS', 1) logging.info('Using %s threads for decoding...', str(num_threads)) logging.info('Set enviroment variable MXNET_CPU_WORKER_NTHREADS to a' @@ -1091,7 +1094,7 @@ def __init__(self, batch_size, data_shape, label_width=1, imgkeys = [] for line in iter(fin.readline, ''): line = line.strip().split('\t') - label = nd.array([float(i) for i in line[1:-1]]) + label = nd.array(line[1:-1], dtype=dtype) key = int(line[0]) imglist[key] = (label, line[-1]) imgkeys.append(key) @@ -1105,11 +1108,11 @@ def __init__(self, batch_size, data_shape, label_width=1, key = str(index) # pylint: disable=redefined-variable-type index += 1 if len(img) > 2: - label = nd.array(img[:-1]) + label = nd.array(img[:-1], dtype=dtype) elif isinstance(img[0], numeric_types): - label = nd.array([img[0]]) + label = nd.array([img[0]], dtype=dtype) else: - label = nd.array(img[0]) + label = nd.array(img[0], dtype=dtype) result[key] = (label, img[-1]) imgkeys.append(str(key)) self.imglist = result diff --git a/tests/python/unittest/test_image.py b/tests/python/unittest/test_image.py index 636c5e2be..9eec1835c 100644 --- a/tests/python/unittest/test_image.py +++ b/tests/python/unittest/test_image.py @@ -132,26 +132,33 @@ def test_color_normalize(self): def test_imageiter(self): - im_list = [[np.random.randint(0, 5), x] for x in TestImage.IMAGES] - test_iter = mx.image.ImageIter(2, (3, 224, 224), label_width=1, imglist=im_list, - path_root='') - for _ in range(3): + def check_imageiter(dtype='float32'): + im_list = [[np.random.randint(0, 5), x] for x in TestImage.IMAGES] + test_iter = mx.image.ImageIter(2, (3, 224, 224), label_width=1, imglist=im_list, + path_root='', dtype=dtype) + for _ in range(3): + for batch in test_iter: + pass + test_iter.reset() + + # test with list file + fname = './data/test_imageiter.lst' + file_list = ['\t'.join([str(k), str(np.random.randint(0, 5)), x]) \ + for k, x in enumerate(TestImage.IMAGES)] + with open(fname, 'w') as f: + for line in file_list: + f.write(line + '\n') + + test_iter = mx.image.ImageIter(2, (3, 224, 224), label_width=1, path_imglist=fname, + path_root='', dtype=dtype) for batch in test_iter: pass - test_iter.reset() - # test with list file - fname = './data/test_imageiter.lst' - file_list = ['\t'.join([str(k), str(np.random.randint(0, 5)), x]) \ - for k, x in enumerate(TestImage.IMAGES)] - with open(fname, 'w') as f: - for line in file_list: - f.write(line + '\n') + for dtype in ['int32', 'float32', 'int64', 'float64']: + check_imageiter(dtype) - test_iter = mx.image.ImageIter(2, (3, 224, 224), label_width=1, path_imglist=fname, - path_root='') - for batch in test_iter: - pass + # test with default dtype + check_imageiter() @with_seed() def test_augmenters(self): From c13ce5cb0adbef4359ced3c07f804f7172ab4c8d Mon Sep 17 00:00:00 2001 From: access2rohit Date: Thu, 26 Jul 2018 13:44:23 -0700 Subject: [PATCH 16/63] [MXNET-378] Adding depth_to_space and 
space_to_depth operator(Updated) (#11587) * [MXNET-378] Adding depth_to_space and space_to_depth operator * fixed lint and windows CPU errors * compliance with C++ style guiide and address shortcomings in unittests * fixed documentation and nitpicky suggestions * added operator references in API docs and removed inplace optimization support * Added references in symbol.md and ndarray.md. Improved test cases and added block_size check * Fixing bugs in documentation. Tests now include tensors of random shapes. --- docs/api/python/ndarray/ndarray.md | 4 + docs/api/python/symbol/symbol.md | 4 + python/mxnet/ndarray/ndarray.py | 16 ++ python/mxnet/symbol/symbol.py | 16 ++ src/operator/tensor/matrix_op-inl.h | 322 +++++++++++++++++++++++++ src/operator/tensor/matrix_op.cc | 107 ++++++++ src/operator/tensor/matrix_op.cu | 6 + tests/python/unittest/test_operator.py | 100 ++++++++ 8 files changed, 575 insertions(+) diff --git a/docs/api/python/ndarray/ndarray.md b/docs/api/python/ndarray/ndarray.md index d92c3e84e..01a154405 100644 --- a/docs/api/python/ndarray/ndarray.md +++ b/docs/api/python/ndarray/ndarray.md @@ -156,6 +156,8 @@ The `ndarray` package provides several classes: NDArray.transpose NDArray.swapaxes NDArray.flip + NDArray.depth_to_space + NDArray.space_to_depth ``` ### Array reduction @@ -411,6 +413,8 @@ The `ndarray` package provides several classes: transpose swapaxes flip + depth_to_space + space_to_depth ``` ### Joining and splitting arrays diff --git a/docs/api/python/symbol/symbol.md b/docs/api/python/symbol/symbol.md index b0db774d9..7c78cbd59 100644 --- a/docs/api/python/symbol/symbol.md +++ b/docs/api/python/symbol/symbol.md @@ -222,6 +222,8 @@ Composite multiple symbols into a new one by an operator. Symbol.transpose Symbol.swapaxes Symbol.flip + Symbol.depth_to_space + Symbol.space_to_depth ``` ### Reduce functions @@ -409,6 +411,8 @@ Composite multiple symbols into a new one by an operator. transpose swapaxes flip + depth_to_space + space_to_depth ``` ### Joining and splitting symbols diff --git a/python/mxnet/ndarray/ndarray.py b/python/mxnet/ndarray/ndarray.py index 64d510296..46b21a90d 100644 --- a/python/mxnet/ndarray/ndarray.py +++ b/python/mxnet/ndarray/ndarray.py @@ -1302,6 +1302,22 @@ def flip(self, *args, **kwargs): """ return op.flip(self, *args, **kwargs) + def depth_to_space(self, *args, **kwargs): + """Convenience fluent method for :py:func:`depth_to_space`. + + The arguments are the same as for :py:func:`depth_to_space`, with + this array as data. + """ + return op.depth_to_space(self, *args, **kwargs) + + def space_to_depth(self, *args, **kwargs): + """Convenience fluent method for :py:func:`space_to_depth`. + + The arguments are the same as for :py:func:`space_to_depth`, with + this array as data. + """ + return op.space_to_depth(self, *args, **kwargs) + def diag(self, k=0, **kwargs): """Convenience fluent method for :py:func:`diag`. diff --git a/python/mxnet/symbol/symbol.py b/python/mxnet/symbol/symbol.py index ea476cdcb..5f6cbd6b6 100644 --- a/python/mxnet/symbol/symbol.py +++ b/python/mxnet/symbol/symbol.py @@ -2046,6 +2046,22 @@ def flip(self, *args, **kwargs): """ return op.flip(self, *args, **kwargs) + def depth_to_space(self, *args, **kwargs): + """Convenience fluent method for :py:func:`depth_to_space`. + + The arguments are the same as for :py:func:`depth_to_space`, with + this array as data. 
+ """ + return op.depth_to_space(self, *args, **kwargs) + + def space_to_depth(self, *args, **kwargs): + """Convenience fluent method for :py:func:`space_to_depth`. + + The arguments are the same as for :py:func:`space_to_depth`, with + this array as data. + """ + return op.space_to_depth(self, *args, **kwargs) + def diag(self, k=0, **kwargs): """Convenience fluent method for :py:func:`diag`. diff --git a/src/operator/tensor/matrix_op-inl.h b/src/operator/tensor/matrix_op-inl.h index dcdf03a53..eec920555 100644 --- a/src/operator/tensor/matrix_op-inl.h +++ b/src/operator/tensor/matrix_op-inl.h @@ -2171,6 +2171,328 @@ inline bool SqueezeShape(const nnvm::NodeAttrs& attrs, return true; } +struct DepthToSpaceParam : public dmlc::Parameter { + int block_size; + DMLC_DECLARE_PARAMETER(DepthToSpaceParam) { + DMLC_DECLARE_FIELD(block_size) + .describe("Blocks of [block_size. block_size] are moved"); + } +}; + +inline bool DepthToSpaceOpShape(const nnvm::NodeAttrs& attrs, + std::vector* in_attrs, + std::vector* out_attrs) { + const DepthToSpaceParam& param = nnvm::get(attrs.parsed); + CHECK_EQ(in_attrs->size(), 1U); + CHECK_EQ(out_attrs->size(), 1U); + CHECK_EQ(in_attrs->at(0).ndim(), 4) << "Operation Depth To Space requires exactly 4D tensor"; + + TShape expected_out(4); + + TShape& in_shape = in_attrs->at(0); + int block = param.block_size; + CHECK_NE(block, 0) << "block_size must be a positive integer value"; + CHECK_NE(in_shape[1], 0) << "Depth dimension:1 cannot be 0"; + CHECK_EQ(in_shape[1] % (block * block), 0) + << "Cannot perform Depth To Space operation on the specified tensor." + " Dimension:1(depth dimension) should be a multiple of 'block^2'"; + CHECK_NE(in_shape[0], 0) + << "Operation requires a 4D tensor. Size of dimension:0 cannot be 0"; + CHECK_NE(in_shape[2], 0) + << "Operation requires a 4D tensor. Size of dimension:2 cannot be 0"; + CHECK_NE(in_shape[3], 0) + << "Operation requires a 4D tensor. Size of dimension:3 cannot be 0"; + + expected_out[0] = in_shape[0]; + expected_out[1] = in_shape[1] / (block * block); + uint32_t i = 2; + while (i < expected_out.ndim()) { + expected_out[i] = in_shape[i] * block; + ++i; + } + + SHAPE_ASSIGN_CHECK(*out_attrs, 0, expected_out); + return true; +} + +inline bool DepthToSpaceOpType(const nnvm::NodeAttrs& attrs, + std::vector* in_attrs, + std::vector* out_attrs) { + CHECK_EQ(in_attrs->size(), 1U); + CHECK_EQ(out_attrs->size(), 1U); + + TYPE_ASSIGN_CHECK(*out_attrs, 0, in_attrs->at(0)); + TYPE_ASSIGN_CHECK(*in_attrs, 0, out_attrs->at(0)); + return out_attrs->at(0) != -1; +} + +/*! + * \brief This function updates the value of input index from where the data element + * needs to be fetched and written out to the ith location in output tensor + * \param index_position index within offset array to get offset of given dimension + * \param dim_size size of current dimension + * \param idx output tensor index + * \param inp_index index within input tensor from where value is retrieved + * \param offset_arr array containing the linear offset of input tensor + */ +MSHADOW_XINLINE void update_index(int index_position, int dim_size, int *idx, + int *inp_index, const int* offset_arr) { + int next_idx_val = *idx / dim_size; + *inp_index += (*idx - next_idx_val * dim_size) * offset_arr[index_position]; + *idx = next_idx_val; +} + +/*! 
+ * \brief This function performs the tensor transpose (0, 1, 2, 3, 4, 5) -> + * (0, 3, 4, 1, 5, 2) by computing linear index within input tensor to be mapped + * to the ith index of output tensor + * \param i tensor index + * \param out_data output tensor + * \param in_data input tensor + * \param block size of chunks to be moved out of depth dimension + * \param size array containing the size of each dimension of input tensor + * \param offset_arr array containing the linear offset of input tensor + */ +template +struct depth_to_space_forward { + template + MSHADOW_XINLINE static void Map(int i, DType* out_data, const DType* in_data, + const int block, const int* size, const int* offset_arr) { + int inp_index = 0, idx = i, dim_size; + dim_size = block; + update_index(2, dim_size, &idx, &inp_index, offset_arr); + dim_size = size[3]; + update_index(5, dim_size, &idx, &inp_index, offset_arr); + dim_size = block; + update_index(1, dim_size, &idx, &inp_index, offset_arr); + dim_size = size[2]; + update_index(4, dim_size, &idx, &inp_index, offset_arr); + dim_size = size[1] / (block * block); + update_index(3, dim_size, &idx, &inp_index, offset_arr); + dim_size = size[0]; + update_index(0, dim_size, &idx, &inp_index, offset_arr); + KERNEL_ASSIGN(out_data[i], req, in_data[inp_index]); + } +}; + +/*! + * \brief This function calculates the linear offset for each dimension of + * input tensor and stores them in an array, which is later used in + * performing depth_to_space operation + * \param i global thread id + * \param offset_arr array to be populated with offset values + * \param size array to be populated with size of each dimension of input tensor + * \param block size of chunks to be moved out of depth dimension + * \param size0 size of Dim 0 of input tensor + * \param size1 size of Dim 1 of input tensor + * \param size2 size of Dim 2 of input tensor + * \param size3 size of Dim 3 of input tensor + */ +template +struct compute_offset_for_depth_to_space { + template + MSHADOW_XINLINE static void Map(int i, DType* offset_arr, DType* size, const int block, + const int32_t size0, const int32_t size1, const int32_t size2, + const int32_t size3) { + size[0] = size0; + size[1] = size1; + size[2] = size2; + size[3] = size3; + + offset_arr[5] = 1; + offset_arr[4] = offset_arr[5] * size[3]; + offset_arr[3] = offset_arr[4] * size[2]; + offset_arr[2] = offset_arr[3] * size[1] / (block * block); + offset_arr[1] = offset_arr[2] * block; + offset_arr[0] = offset_arr[1] * block; + } +}; + +template +void DepthToSpaceOpForward(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + CHECK_EQ(inputs.size(), 1U); + CHECK_EQ(outputs.size(), 1U); + CHECK_EQ(req.size(), 1U); + mshadow::Stream *s = ctx.get_stream(); + const TBlob& in_data = inputs[0]; + const TBlob& out_data = outputs[0]; + const DepthToSpaceParam& param = nnvm::get(attrs.parsed); + using namespace mxnet_op; + int block = param.block_size; + + mshadow::Tensor workspace = + ctx.requested[0].get_space_typed(mshadow::Shape1(sizeof(int32_t) * 10), s); + char* workspace_curr_ptr = workspace.dptr_; + int32_t* offset_arr = reinterpret_cast(workspace_curr_ptr); + int32_t* size = reinterpret_cast(workspace_curr_ptr + sizeof(int32_t) * 6); + + MSHADOW_TYPE_SWITCH(out_data.type_flag_, DType, { + MXNET_ASSIGN_REQ_SWITCH(req[0], req_type, { + Kernel, xpu>::Launch( + s, 1, offset_arr, size, block, in_data.shape_[0], in_data.shape_[1], + in_data.shape_[2], in_data.shape_[3]); 
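+    // Note: the single-thread launch above only materializes the size/offset
+    // arrays in the temp workspace; the launch below does the real work, with
+    // one thread per output element mapping its linear output index back to
+    // the corresponding input element via update_index().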
+ + Kernel, xpu>::Launch( + s, out_data.Size(), out_data.dptr(), in_data.dptr(), + block, size, offset_arr); + }); + }); +} + +inline bool SpaceToDepthOpShape(const nnvm::NodeAttrs& attrs, + std::vector* in_attrs, + std::vector* out_attrs) { + const DepthToSpaceParam& param = nnvm::get(attrs.parsed); + CHECK_EQ(in_attrs->size(), 1U); + CHECK_EQ(out_attrs->size(), 1U); + CHECK_EQ(in_attrs->at(0).ndim(), 4) << "Operation Space To Depth requires exactly 4D tensor"; + + TShape expected_out(in_attrs->at(0).ndim()); + + TShape& in_shape = in_attrs->at(0); + int block = param.block_size; + CHECK_NE(block, 0) << "block_size must be a positive integer value"; + CHECK_NE(in_shape[0], 0) + << "Operation requires a 4D tensor. Size of dimension:0 cannot be 0"; + CHECK_NE(in_shape[1], 0) << "Depth dimension:1 cannot be 0"; + CHECK_NE(in_shape[2], 0) + << "Operation requires a 4D tensor. Size of dimension:2 cannot be 0"; + CHECK_EQ(in_shape[2] % block, 0) + << "Cannot perform Depth To Space operation on the specified tensor." + " Dimension:2(1st Space dimension) should be a multiple of 'block' "; + CHECK_NE(in_shape[3], 0) + << "Operation requires a 4D tensor. Size of dimension:3 cannot be 0"; + CHECK_EQ(in_shape[3] % block, 0) + << "Cannot perform Depth To Space operation on the specified tensor." + " Dimension:3(2nd space dimension) should be a multiple of 'block' "; + + expected_out[0] = in_shape[0]; + expected_out[1] = in_shape[1] * block * block; + uint32_t i = 2; + while (i < expected_out.ndim()) { + expected_out[i] = in_shape[i] / block; + ++i; + } + + SHAPE_ASSIGN_CHECK(*out_attrs, 0, expected_out); + return true; +} + +inline bool SpaceToDepthOpType(const nnvm::NodeAttrs& attrs, + std::vector* in_attrs, + std::vector* out_attrs) { + CHECK_EQ(in_attrs->size(), 1U); + CHECK_EQ(out_attrs->size(), 1U); + + TYPE_ASSIGN_CHECK(*out_attrs, 0, in_attrs->at(0)); + TYPE_ASSIGN_CHECK(*in_attrs, 0, out_attrs->at(0)); + return out_attrs->at(0) != -1; +} + +/*! + * \brief This function preforms the tensor transpose (0, 1, 2, 3, 4, 5) -> + * (0, 3, 5, 1, 2, 4) by computing linear index within input tensor to be mapped + * to the ith index of output tensor + * \param i tensor index + * \param out_data output tensor + * \param in_data input tensor + * \param block size of chunks to be moved out of depth dimension + * \param size array containing the size of each dimension of input tensor + * \param offset_arr array containing the linear offset of input tensor + */ +template +struct space_to_depth_forward { + template + MSHADOW_XINLINE static void Map(int i, DType* out_data, const DType* in_data, const int block, + const int* size, const int* offset_arr) { + int inp_index = 0, idx = i, dim_size; + dim_size = size[3] / block; + update_index(4, dim_size, &idx, &inp_index, offset_arr); + dim_size = size[2] / block; + update_index(2, dim_size, &idx, &inp_index, offset_arr); + dim_size = size[1]; + update_index(1, dim_size, &idx, &inp_index, offset_arr); + dim_size = block; + update_index(5, dim_size, &idx, &inp_index, offset_arr); + dim_size = block; + update_index(3, dim_size, &idx, &inp_index, offset_arr); + dim_size = size[0]; + update_index(0, dim_size, &idx, &inp_index, offset_arr); + KERNEL_ASSIGN(out_data[i], req, in_data[inp_index]); + } +}; + +/*! 
+ * \brief This function calculates the linear offset for each dimension of + * input tensor and stores them in an array, which is later used in + * performing space_to_depth operation + * \param i global thread id + * \param offset_arr array to be populated with offset values + * \param size array to be populated with size of each dimension of input tensor + * \param block size of chunks to be moved out of depth dimension + * \param size0 size of Dim 0 of input tensor + * \param size1 size of Dim 1 of input tensor + * \param size2 size of Dim 2 of input tensor + * \param size3 size of Dim 3 of input tensor + */ +template +struct compute_offset_for_space_to_depth { + template + MSHADOW_XINLINE static void Map(int i, DType* offset_arr, DType* size, const int block, + const int32_t size0, const int32_t size1, + const int32_t size2, const int32_t size3) { + size[0] = size0; + size[1] = size1; + size[2] = size2; + size[3] = size3; + + offset_arr[5] = 1; + offset_arr[4] = offset_arr[5] * block; + offset_arr[3] = offset_arr[4] * size[3] / block; + offset_arr[2] = offset_arr[3] * block; + offset_arr[1] = offset_arr[2] * size[2] / block; + offset_arr[0] = offset_arr[1] * size[1]; + } +}; + +template +void SpaceToDepthOpForward(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + CHECK_EQ(inputs.size(), 1U); + CHECK_EQ(outputs.size(), 1U); + CHECK_EQ(req.size(), 1U); + mshadow::Stream *s = ctx.get_stream(); + const TBlob& in_data = inputs[0]; + const TBlob& out_data = outputs[0]; + const DepthToSpaceParam& param = nnvm::get(attrs.parsed); + using namespace mxnet_op; + int block = param.block_size; + + mshadow::Tensor workspace = + ctx.requested[0].get_space_typed(mshadow::Shape1(sizeof(int32_t) * 10), s); + char* workspace_curr_ptr = workspace.dptr_; + int32_t* offset_arr = reinterpret_cast(workspace_curr_ptr); + int32_t* size = reinterpret_cast(workspace_curr_ptr + sizeof(int32_t) * 6); + + MSHADOW_TYPE_SWITCH(out_data.type_flag_, DType, { + MXNET_ASSIGN_REQ_SWITCH(req[0], req_type, { + Kernel, xpu>::Launch( + s, 1, offset_arr, size, block, in_data.shape_[0], in_data.shape_[1], + in_data.shape_[2], in_data.shape_[3]); + Kernel, xpu>::Launch( + s, out_data.Size(), out_data.dptr(), in_data.dptr(), + block, size, offset_arr); + }); + }); +} + } // namespace op } // namespace mxnet diff --git a/src/operator/tensor/matrix_op.cc b/src/operator/tensor/matrix_op.cc index 29d493ae5..ffdc228b2 100644 --- a/src/operator/tensor/matrix_op.cc +++ b/src/operator/tensor/matrix_op.cc @@ -101,6 +101,7 @@ DMLC_REGISTER_PARAMETER(TileParam); DMLC_REGISTER_PARAMETER(ReverseParam); DMLC_REGISTER_PARAMETER(StackParam); DMLC_REGISTER_PARAMETER(SqueezeParam); +DMLC_REGISTER_PARAMETER(DepthToSpaceParam); NNVM_REGISTER_OP(Reshape) .add_alias("reshape") @@ -908,5 +909,111 @@ NNVM_REGISTER_OP(_backward_squeeze) .set_attr("TIsBackward", true) .set_attr("FCompute", UnaryOp::IdentityCompute); +NNVM_REGISTER_OP(depth_to_space) +.describe(R"code(Rearranges(permutes) data from depth into blocks of spatial data. +Similar to ONNX DepthToSpace operator: +https://github.com/onnx/onnx/blob/master/docs/Operators.md#DepthToSpace. +The output is a new tensor where the values from depth dimension are moved in spatial blocks +to height and width dimension. The reverse of this operation is ``space_to_depth``. + +.. 
math:: + + \begin{gather*} + x \prime = reshape(x, [N, block\_size, block\_size, C / (block\_size ^ 2), H * block\_size, W * block\_size]) \\ + x \prime \prime = transpose(x \prime, [0, 3, 4, 1, 5, 2]) \\ + y = reshape(x \prime \prime, [N, C / (block\_size ^ 2), H * block\_size, W * block\_size]) + \end{gather*} + +where :math:`x` is an input tensor with default layout as :math:`[N, C, H, W]`: [batch, channels, height, width] +and :math:`y` is the output tensor of layout :math:`[N, C / (block\_size ^ 2), H * block\_size, W * block\_size]` + +Example:: + + x = [[[[0, 1, 2], + [3, 4, 5]], + [[6, 7, 8], + [9, 10, 11]], + [[12, 13, 14], + [15, 16, 17]], + [[18, 19, 20], + [21, 22, 23]]]] + + depth_to_space(x, 2) = [[[[0, 6, 1, 7, 2, 8], + [12, 18, 13, 19, 14, 20], + [3, 9, 4, 10, 5, 11], + [15, 21, 16, 22, 17, 23]]]] +)code" ADD_FILELINE) +.set_attr_parser(ParamParser) +.set_num_inputs(1) +.set_num_outputs(1) +.set_attr("FListInputNames", + [](const NodeAttrs& attrs) { + return std::vector{"data"}; + }) +.set_attr("FInferShape", DepthToSpaceOpShape) +.set_attr("FInferType", DepthToSpaceOpType) +.set_attr("FCompute", DepthToSpaceOpForward) +.set_attr("FResourceRequest", + [](const NodeAttrs& n) { + return std::vector{ResourceRequest::kTempSpace}; +}) +.set_attr("FGradient", ElemwiseGradUseNone{"space_to_depth"}) +.add_argument("data", "NDArray-or-Symbol", "Input ndarray") +.add_arguments(DepthToSpaceParam::__FIELDS__()); + +NNVM_REGISTER_OP(space_to_depth) +.describe(R"code(Rearranges(permutes) blocks of spatial data into depth. +Similar to ONNX SpaceToDepth operator: +https://github.com/onnx/onnx/blob/master/docs/Operators.md#SpaceToDepth + +The output is a new tensor where the values from height and width dimension are +moved to the depth dimension. The reverse of this operation is ``depth_to_space``. + +.. 
math:: + + \begin{gather*} + x \prime = reshape(x, [N, C, H / block\_size, block\_size, W / block\_size, block\_size]) \\ + x \prime \prime = transpose(x \prime, [0, 3, 5, 1, 2, 4]) \\ + y = reshape(x \prime \prime, [N, C * (block\_size ^ 2), H / block\_size, W / block\_size]) + \end{gather*} + +where :math:`x` is an input tensor with default layout as :math:`[N, C, H, W]`: [batch, channels, height, width] +and :math:`y` is the output tensor of layout :math:`[N, C * (block\_size ^ 2), H / block\_size, W / block\_size]` + +Example:: + + x = [[[[0, 6, 1, 7, 2, 8], + [12, 18, 13, 19, 14, 20], + [3, 9, 4, 10, 5, 11], + [15, 21, 16, 22, 17, 23]]]] + + + space_to_depth(x, 2) = [[[[0, 1, 2], + [3, 4, 5]], + [[6, 7, 8], + [9, 10, 11]], + [[12, 13, 14], + [15, 16, 17]], + [[18, 19, 20], + [21, 22, 23]]]] +)code" ADD_FILELINE) +.set_attr_parser(ParamParser) +.set_num_inputs(1) +.set_num_outputs(1) +.set_attr("FListInputNames", + [](const NodeAttrs& attrs) { + return std::vector{"data"}; + }) +.set_attr("FInferShape", SpaceToDepthOpShape) +.set_attr("FInferType", SpaceToDepthOpType) +.set_attr("FCompute", SpaceToDepthOpForward) +.set_attr("FResourceRequest", + [](const NodeAttrs& n) { + return std::vector{ResourceRequest::kTempSpace}; +}) +.set_attr("FGradient", ElemwiseGradUseNone{"depth_to_space"}) +.add_argument("data", "NDArray-or-Symbol", "Input ndarray") +.add_arguments(DepthToSpaceParam::__FIELDS__()); + } // namespace op } // namespace mxnet diff --git a/src/operator/tensor/matrix_op.cu b/src/operator/tensor/matrix_op.cu index bd1b9f208..4e31a4cf1 100644 --- a/src/operator/tensor/matrix_op.cu +++ b/src/operator/tensor/matrix_op.cu @@ -211,5 +211,11 @@ NNVM_REGISTER_OP(squeeze) NNVM_REGISTER_OP(_backward_squeeze) .set_attr("FCompute", UnaryOp::IdentityCompute); +NNVM_REGISTER_OP(depth_to_space) +.set_attr("FCompute", DepthToSpaceOpForward); + +NNVM_REGISTER_OP(space_to_depth) +.set_attr("FCompute", SpaceToDepthOpForward); + } // namespace op } // namespace mxnet diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index e50f8a143..fa5de0c68 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -6679,6 +6679,106 @@ def test_diag(): diag_sym = mx.sym.diag(data=data, k=-1) check_numeric_gradient(diag_sym, [a_np]) +@with_seed() +def test_depthtospace(): + def f(x, blocksize): + b, c, h, w = x.shape[0], x.shape[1], x.shape[2], x.shape[3] + tmp = np.reshape(x, [b, blocksize, blocksize, c // (blocksize**2), h, w]) + tmp = np.transpose(tmp, [0, 3, 4, 1, 5, 2]) + y = np.reshape(tmp, [b, c // (blocksize**2), h * blocksize, w * blocksize]) + return y + + block = random.randint(2, 4) + rand_mul1 = random.randint(1, 4) + n = random.randint(1, 5) + c = block * block * rand_mul1 + h = random.randint(1, 5) + w = random.randint(1, 5) + shape_inp = (n, c, h, w) + data = rand_ndarray(shape_inp, 'default') + data_np = data.asnumpy() + expected = f(data_np, block) + output = mx.nd.depth_to_space(data, block) + assert_almost_equal(output.asnumpy(), expected, atol=1e-3, rtol=1e-3) + + shape_out = (n, c // (block ** 2), h * block, w * block) + data = mx.sym.Variable('data') + dts_sym = mx.sym.depth_to_space(data, block) + check_numeric_gradient(dts_sym, [np.ones(shape_inp)]) + + check_symbolic_forward(dts_sym, [data_np], [expected]) + check_symbolic_backward(dts_sym, [data_np], [np.ones(shape_out)], [np.ones(shape_inp)]) + + def test_invalid_depth_dim(): + invalid_shape_inp = (n, block - 1, h, w) + data = 
rand_ndarray(invalid_shape_inp, 'default') + assertRaises(MXNetError, mx.nd.depth_to_space, data, block) + + def test_invalid_space_dim(): + invalid_shape_inp = (n, block ** 2, 0, block + 1) + data = rand_ndarray(invalid_shape_inp, 'default') + assertRaises(MXNetError, mx.nd.depth_to_space, data, block) + + def test_invalid_block_size(): + block = 0 + invalid_shape_inp = (n , c, h, w) + data = rand_ndarray(invalid_shape_inp, 'default') + assertRaises(MXNetError, mx.nd.depth_to_space, data, block) + + test_invalid_depth_dim() + test_invalid_space_dim() + test_invalid_block_size() + +@with_seed() +def test_spacetodepth(): + def f(x, blocksize): + b, c, h, w = x.shape[0], x.shape[1], x.shape[2], x.shape[3] + tmp = np.reshape(x, [b, c, h // blocksize, blocksize, w // blocksize, blocksize]) + tmp = np.transpose(tmp, [0, 3, 5, 1, 2, 4]) + y = np.reshape(tmp, [b, c * (blocksize**2), h // blocksize, w // blocksize]) + return y + + block = random.randint(2, 4) + rand_mul1 = random.randint(1, 4) + rand_mul2 = random.randint(1, 4) + n = random.randint(1, 5) + c = random.randint(1, 5) + h = block * rand_mul1 + w = block * rand_mul2 + shape_inp = (n, c, h, w) + data = rand_ndarray(shape_inp, 'default') + data_np = data.asnumpy() + expected = f(data_np, block) + output = mx.nd.space_to_depth(data, block) + assert_almost_equal(output.asnumpy(), expected, atol=1e-3, rtol=1e-3) + + shape_out = (n, c * (block ** 2), h // block, w // block) + data = mx.sym.Variable('data') + dts_sym = mx.sym.space_to_depth(data, block) + check_numeric_gradient(dts_sym, [np.ones(shape_inp)]) + + check_symbolic_forward(dts_sym, [data_np], [expected]) + check_symbolic_backward(dts_sym, [data_np], [np.ones(shape_out)], [np.ones(shape_inp)]) + + def test_invalid_space_dim(): + invalid_shape_inp = (n , c, block - 1, w) + data = rand_ndarray(invalid_shape_inp, 'default') + assertRaises(MXNetError, mx.nd.space_to_depth, data, block) + + def test_invalid_block_size(): + block = 0 + invalid_shape_inp = (n, c, h, w) + data = rand_ndarray(invalid_shape_inp, 'default') + assertRaises(MXNetError, mx.nd.space_to_depth, data, block) + + def test_invalid_depth_dim(): + invalid_shape_inp = (n, 0, h, w) + data = rand_ndarray(invalid_shape_inp, 'default') + assertRaises(MXNetError, mx.nd.space_to_depth, data, block) + + test_invalid_space_dim() + test_invalid_block_size() + test_invalid_depth_dim() if __name__ == '__main__': import nose From 2bddf6f039e94506d11a6539b0e921e5440e09eb Mon Sep 17 00:00:00 2001 From: Mingkun Huang Date: Sat, 28 Jul 2018 03:46:35 +0800 Subject: [PATCH 17/63] Fix mxnet ctc_loss bug (#11834) * fix ctc_loss GPU bug * add blank_label parameter for CTCLoss * Revert "add blank_label parameter for CTCLoss" This reverts commit aab11f7575580f88f5f27be14466d0deb4b4c456. 
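The one-line change below drops the max-subtraction from the softmax denominator computation. The two denominators differ by a factor of exp(max), so the denominator has to be consistent with whatever the probability kernel exponentiates afterwards; shifting only one side skews every probability. A small NumPy sketch of the mismatch (illustrative only, not MXNet code):

```python
import numpy as np

logits = np.array([[2.0, 1.0, 0.5]])
m = logits.max(axis=1, keepdims=True)

denom_shifted = np.exp(logits - m).sum(axis=1)  # old denominator: sum(exp(x - max))
denom_plain = np.exp(logits).sum(axis=1)        # new denominator: sum(exp(x))

# Dividing unshifted exponentials by the shifted denominator breaks normalization:
p_bad = np.exp(logits) / denom_shifted[:, None]
p_good = np.exp(logits) / denom_plain[:, None]
print(p_bad.sum(axis=1))   # ~7.39 == exp(max), not 1
print(p_good.sum(axis=1))  # 1.0
```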
--- src/operator/contrib/ctc_include/detail/gpu_ctc.h | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/src/operator/contrib/ctc_include/detail/gpu_ctc.h b/src/operator/contrib/ctc_include/detail/gpu_ctc.h index 8015b39c4..2c521b5ab 100644 --- a/src/operator/contrib/ctc_include/detail/gpu_ctc.h +++ b/src/operator/contrib/ctc_include/detail/gpu_ctc.h @@ -411,12 +411,7 @@ GpuCTC::compute_log_probs(const ProbT* const activations) { denoms_, out_dim_, num_elements); // compute denominators for softmax - denoms_handle = reduce_with_axis( - F( - log_probs_handle - - broadcast<0>(reduce_with_axis(log_probs_handle, 1), - log_probs_handle.shape_)), - 1); + denoms_handle = reduce_with_axis(F(log_probs_handle), 1); // Kernel launch to calculate probabilities compute_log_probs_kernel<<>> From 4bbf15c85d300801f6f880f7abe4628e68ced2f7 Mon Sep 17 00:00:00 2001 From: Anirudh Date: Fri, 27 Jul 2018 13:02:44 -0700 Subject: [PATCH 18/63] [MXNET-344] Add more operators to onnx import (#11856) * add more ops * use dict.get * add list comprehensive * retrigger CI due to unrelated flaky test failure --- .../contrib/onnx/onnx2mx/_import_helper.py | 26 +++++-- .../contrib/onnx/onnx2mx/_op_translations.py | 73 ++++++++++++++++++- tests/python-pytest/onnx/import/test_cases.py | 39 ++++++---- 3 files changed, 116 insertions(+), 22 deletions(-) diff --git a/python/mxnet/contrib/onnx/onnx2mx/_import_helper.py b/python/mxnet/contrib/onnx/onnx2mx/_import_helper.py index c19f0f2cb..c44403d49 100644 --- a/python/mxnet/contrib/onnx/onnx2mx/_import_helper.py +++ b/python/mxnet/contrib/onnx/onnx2mx/_import_helper.py @@ -20,8 +20,9 @@ """Operator attributes conversion""" from ._op_translations import identity, random_uniform, random_normal from ._op_translations import add, subtract, multiply, divide, absolute, negative, add_n -from ._op_translations import tanh -from ._op_translations import ceil, floor +from ._op_translations import tanh, arccos, arcsin, arctan, _cos, _sin, _tan +from ._op_translations import softplus, shape, gather, lp_pooling +from ._op_translations import ceil, floor, hardsigmoid, global_lppooling from ._op_translations import concat from ._op_translations import leaky_relu, _elu, _prelu, softmax, fully_connected from ._op_translations import global_avgpooling, global_maxpooling, linalg_gemm @@ -30,12 +31,13 @@ from ._op_translations import reshape, cast, split, _slice, transpose, squeeze, flatten from ._op_translations import reciprocal, squareroot, power, exponent, _log, unsqueeze from ._op_translations import reduce_max, reduce_mean, reduce_min, reduce_sum -from ._op_translations import reduce_prod, avg_pooling, max_pooling +from ._op_translations import reduce_prod, avg_pooling, max_pooling, instance_norm from ._op_translations import argmax, argmin, maximum, minimum from ._op_translations import clip, reduce_log_sum, reduce_log_sum_exp -from ._op_translations import reduce_sum_square, reduce_l2, max_roi_pooling, instance_norm +from ._op_translations import reduce_sum_square, reduce_l1, reduce_l2, max_roi_pooling from ._op_translations import log_softmax, softsign, lesser, greater, equal from ._op_translations import logical_and, logical_or, logical_xor, logical_not +from ._op_translations import mean # convert_map defines maps of ONNX operator names to converter functor(callable) # defined in the op_translations module. 
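One converter in this patch deserves a note: ONNX `Mean` takes a variable number of input tensors, and MXNet has no variadic elementwise-mean operator, so the new `mean` translation (in `_op_translations.py` below) builds one from `expand_dims`, `concat`, and `mean`. A self-contained sketch of the same construction (the helper name here is ours, for illustration):

```python
import mxnet as mx

def elementwise_mean(*arrays):
    # Stack the inputs along a new leading axis, then average that axis away.
    stacked = mx.nd.concat(*[a.expand_dims(axis=0) for a in arrays], dim=0)
    return stacked.mean(axis=0)

a = mx.nd.array([1.0, 2.0])
b = mx.nd.array([3.0, 6.0])
print(elementwise_mean(a, b).asnumpy())  # [2. 4.]
```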
@@ -77,6 +79,7 @@ 'FC' : fully_connected, 'GlobalAveragePool' : global_avgpooling, 'GlobalMaxPool' : global_maxpooling, + 'GlobalLpPool' : global_lppooling, 'Gemm' : linalg_gemm, 'LRN' : local_response_norm, 'Dropout' : dropout, @@ -113,6 +116,7 @@ 'ReduceLogSum' : reduce_log_sum, 'ReduceLogSumExp' : reduce_log_sum_exp, 'ReduceSumSquare' : reduce_sum_square, + 'ReduceL1' : reduce_l1, 'ReduceL2' : reduce_l2, 'MaxRoiPool' : max_roi_pooling, 'InstanceNormalization' : instance_norm, @@ -124,5 +128,17 @@ 'And' : logical_and, 'Xor' : logical_xor, 'Not' : logical_not, - 'Or' : logical_or + 'Or' : logical_or, + 'Mean' : mean, + 'Acos' : arccos, + 'Asin' : arcsin, + 'Atan' : arctan, + 'Cos' : _cos, + 'Sin' : _sin, + 'Softplus' : softplus, + 'Tan' : _tan, + 'Shape' : shape, + 'Gather' : gather, + 'HardSigmoid' : hardsigmoid, + 'LpPool' : lp_pooling } diff --git a/python/mxnet/contrib/onnx/onnx2mx/_op_translations.py b/python/mxnet/contrib/onnx/onnx2mx/_op_translations.py index aa37856ff..4d1e95612 100644 --- a/python/mxnet/contrib/onnx/onnx2mx/_op_translations.py +++ b/python/mxnet/contrib/onnx/onnx2mx/_op_translations.py @@ -80,6 +80,13 @@ def divide(attrs, inputs, proto_obj): return op_value, new_attr, inputs return 'broadcast_div', new_attr, inputs +def mean(attrs, inputs, proto_obj): + """Mean of all the input tensors.""" + concat_input = [symbol.expand_dims(op_input, axis=0) for op_input in inputs] + concat_sym = symbol.concat(*concat_input, dim=0) + mean_sym = symbol.mean(concat_sym, axis=0) + return mean_sym, attrs, inputs + def logical_and(attrs, inputs, proto_obj): """Logical and of two input arrays.""" return 'broadcast_logical_and', attrs, inputs @@ -186,6 +193,10 @@ def sigmoid(attrs, inputs, proto_obj): """Computes elementwise sigmoid of the input array""" return 'sigmoid', attrs, inputs +def hardsigmoid(attrs, inputs, proto_obj): + """Computes elementwise hard sigmoid of the input array""" + return 'hard_sigmoid', attrs, inputs + def relu(attrs, inputs, proto_obj): """Computes rectified linear function.""" return 'relu', attrs, inputs @@ -348,6 +359,14 @@ def global_avgpooling(attrs, inputs, proto_obj): 'pool_type': 'avg'}) return 'Pooling', new_attrs, inputs +def global_lppooling(attrs, inputs, proto_obj): + """Performs global lp pooling on the input.""" + p_value = attrs.get('p', 2) + new_attrs = translation_utils._add_extra_attributes(attrs, {'global_pool': True, + 'kernel': (1, 1), + 'pool_type': 'lp', + 'p_value': p_value}) + return 'Pooling', new_attrs, inputs def linalg_gemm(attrs, inputs, proto_obj): """Performs general matrix multiplication and accumulation""" @@ -465,7 +484,6 @@ def unsqueeze(attrs, inputs, cls): return mxnet_op, attrs, inputs - def flatten(attrs, inputs, proto_obj): """Flattens the input array into a 2-D array by collapsing the higher dimensions.""" #Mxnet does not have axis support. 
By default uses axis=1 @@ -484,6 +502,10 @@ def clip(attrs, inputs, proto_obj): new_attrs = translation_utils._add_extra_attributes(new_attrs, {'a_min' : -np.inf}) return 'clip', new_attrs, inputs +def gather(attrs, inputs, proto_obj): + """Gather elements from an input array along the given axis.""" + return 'take', attrs, inputs + #Powers def reciprocal(attrs, inputs, proto_obj): """Returns the reciprocal of the argument, element-wise.""" @@ -505,6 +527,30 @@ def exponent(attrs, inputs, proto_obj): """Elementwise exponent of input array.""" return 'exp', attrs, inputs +def _cos(attrs, inputs, proto_obj): + """Elementwise cosine of input array.""" + return 'cos', attrs, inputs + +def _sin(attrs, inputs, proto_obj): + """Elementwise sine of input array.""" + return 'sin', attrs, inputs + +def _tan(attrs, inputs, proto_obj): + """Elementwise tan of input array.""" + return 'tan', attrs, inputs + +def arccos(attrs, inputs, proto_obj): + """Elementwise inverse cos of input array.""" + return 'arccos', attrs, inputs + +def arcsin(attrs, inputs, proto_obj): + """Elementwise inverse sin of input array.""" + return 'arcsin', attrs, inputs + +def arctan(attrs, inputs, proto_obj): + """Elementwise inverse tan of input array.""" + return 'arctan', attrs, inputs + def _log(attrs, inputs, proto_obj): """Elementwise log of input array.""" return 'log', attrs, inputs @@ -559,6 +605,17 @@ def reduce_sum_square(attrs, inputs, proto_obj): keepdims=attrs.get('keepdims')) return sum_op, attrs, inputs +def reduce_l1(attrs, inputs, proto_obj): + """Reduce input tensor by l1 normalization.""" + new_attrs = translation_utils._fix_attribute_names(attrs, {'axes':'axis'}) + new_attrs = translation_utils._add_extra_attributes(new_attrs, + {'ord' : 1}) + return 'norm', new_attrs, inputs + +def shape(attrs, inputs, proto_obj): + """Returns shape of input array.""" + return 'shape_array', attrs, inputs + def reduce_l2(attrs, inputs, proto_obj): """Reduce input tensor by l2 normalization.""" new_attrs = translation_utils._fix_attribute_names(attrs, {'axes':'axis'}) @@ -578,6 +635,20 @@ def avg_pooling(attrs, inputs, proto_obj): return new_op, new_attrs, inputs +def lp_pooling(attrs, inputs, proto_obj): + """LP Pooling""" + p_value = attrs.get('p', 2) + new_attrs = translation_utils._fix_attribute_names(attrs, + {'kernel_shape': 'kernel', + 'strides': 'stride', + 'pads': 'pad', + 'p_value': p_value + }) + new_attrs = translation_utils._add_extra_attributes(new_attrs, + {'pooling_convention': 'valid' + }) + new_op = translation_utils._fix_pooling('lp', inputs, new_attrs) + return new_op, new_attrs, inputs def max_pooling(attrs, inputs, proto_obj): """ Average pooling""" diff --git a/tests/python-pytest/onnx/import/test_cases.py b/tests/python-pytest/onnx/import/test_cases.py index d2574dba9..3cad3abf6 100644 --- a/tests/python-pytest/onnx/import/test_cases.py +++ b/tests/python-pytest/onnx/import/test_cases.py @@ -19,8 +19,7 @@ IMPLEMENTED_OPERATORS_TEST = [ 'test_split_equal' - 'test_random_uniform', - 'test_random_normal', + 'test_random_', 'test_add', 'test_sub', 'test_mul', @@ -38,11 +37,7 @@ 'test_constant_pad', 'test_edge_pad', 'test_reflect_pad', - 'test_reduce_min', - 'test_reduce_max', - 'test_reduce_mean', - 'test_reduce_prod', - 'test_squeeze', + 'test_squeeze_', 'test_unsqueeze', 'test_softmax_example', 'test_softmax_large_number', @@ -50,9 +45,9 @@ 'test_transpose', 'test_globalmaxpool', 'test_globalaveragepool', + 'test_global_lppooling', 'test_slice_cpu', 'test_slice_neg', - 'test_squeeze_', 
'test_reciprocal', 'test_sqrt', 'test_pow', @@ -60,19 +55,31 @@ 'test_argmax', 'test_argmin', 'test_min', - 'test_logical_and', - 'test_logical_xor', - 'test_logical_not', - 'test_logical_or', + 'test_logical_', # enabling partial test cases for matmul 'test_matmul_3d', 'test_matmul_4d', 'test_clip', 'test_softsign', - 'test_reduce_l2', - 'test_reduce_log_sum', - 'test_reduce_log_sum_exp', - 'test_reduce_sum_square' + 'test_reduce_', + 'test_softplus', + 'test_mean', + 'test_acos', + 'test_asin', + 'test_atan', + 'test_cos', + 'test_sin', + 'test_tan', + 'test_shape', + 'test_hardsigmoid_', + 'test_averagepool_1d', + 'test_averagepool_2d_pads_count_include_pad', + 'test_averagepool_2d_precomputed_pads_count_include_pad', + 'test_averagepool_2d_precomputed_strides', + 'test_averagepool_2d_strides', + 'test_averagepool_3d', + 'test_LpPool_', + 'test_instancenorm_epsilon', #pytorch operator tests 'test_operator_exp', 'test_operator_maxpool', From a8c873742c25a6cd4b78c6a4d8e1026378fda77d Mon Sep 17 00:00:00 2001 From: Lanking Date: Fri, 27 Jul 2018 14:24:15 -0700 Subject: [PATCH 19/63] make skiptest work (#11889) --- Makefile | 10 +++++----- scala-package/core/pom.xml | 6 +++--- scala-package/examples/pom.xml | 6 +++--- scala-package/infer/pom.xml | 6 +++--- scala-package/pom.xml | 1 + 5 files changed, 15 insertions(+), 14 deletions(-) diff --git a/Makefile b/Makefile index 88f7dd927..18661aa69 100644 --- a/Makefile +++ b/Makefile @@ -608,7 +608,7 @@ scalaintegrationtest: scalainstall: (cd $(ROOTDIR)/scala-package; \ - mvn install -P$(SCALA_PKG_PROFILE),$(SCALA_VERSION_PROFILE) -DskipTests -Dcxx="$(CXX)" \ + mvn install -P$(SCALA_PKG_PROFILE),$(SCALA_VERSION_PROFILE) -DskipTests=true -Dcxx="$(CXX)" \ -Dbuild.platform="$(SCALA_PKG_PROFILE)" \ -Dcflags="$(CFLAGS)" -Dldflags="$(LDFLAGS)" \ -Dlddeps="$(LIB_DEP) $(ROOTDIR)/lib/libmxnet.a") @@ -617,23 +617,23 @@ scalarelease-dryrun: (cd $(ROOTDIR)/scala-package; \ mvn release:clean release:prepare -DdryRun=true -DautoVersionSubmodules=true \ -Papache-release,$(SCALA_PKG_PROFILE),$(SCALA_VERSION_PROFILE) \ - -Darguments=""-Dbuild\.platform=\""$(SCALA_PKG_PROFILE)\""\ -DskipTests\ -Dcflags=\""$(CFLAGS)\""\ -Dcxx=\""$(CXX)\""\ -Dldflags=\""$(LDFLAGS)\""\ -Dlddeps=\""$(LIB_DEP) $(ROOTDIR)/lib/libmxnet.a\"""") + -Darguments=""-Dbuild\.platform=\""$(SCALA_PKG_PROFILE)\""\ -DskipTests=true\ -Dcflags=\""$(CFLAGS)\""\ -Dcxx=\""$(CXX)\""\ -Dldflags=\""$(LDFLAGS)\""\ -Dlddeps=\""$(LIB_DEP) $(ROOTDIR)/lib/libmxnet.a\"""") scalarelease-prepare: (cd $(ROOTDIR)/scala-package; \ mvn release:clean release:prepare -DautoVersionSubmodules=true \ -Papache-release,$(SCALA_PKG_PROFILE),$(SCALA_VERSION_PROFILE) \ - -Darguments=""-Dbuild\.platform=\""$(SCALA_PKG_PROFILE)\""\ -DskipTests\ -Dcflags=\""$(CFLAGS)\""\ -Dcxx=\""$(CXX)\""\ -Dldflags=\""$(LDFLAGS)\""\ -Dlddeps=\""$(LIB_DEP) $(ROOTDIR)/lib/libmxnet.a\"""") + -Darguments=""-Dbuild\.platform=\""$(SCALA_PKG_PROFILE)\""\ -DskipTests=true\ -Dcflags=\""$(CFLAGS)\""\ -Dcxx=\""$(CXX)\""\ -Dldflags=\""$(LDFLAGS)\""\ -Dlddeps=\""$(LIB_DEP) $(ROOTDIR)/lib/libmxnet.a\"""") scalarelease-perform: (cd $(ROOTDIR)/scala-package; \ mvn release:perform -DautoVersionSubmodules=true \ -Papache-release,$(SCALA_PKG_PROFILE),$(SCALA_VERSION_PROFILE) \ - -Darguments=""-Dbuild\.platform=\""$(SCALA_PKG_PROFILE)\""\ -DskipTests\ -Dcflags=\""$(CFLAGS)\""\ -Dcxx=\""$(CXX)\""\ -Dldflags=\""$(LDFLAGS)\""\ -Dlddeps=\""$(LIB_DEP) $(ROOTDIR)/lib/libmxnet.a\"""") + -Darguments=""-Dbuild\.platform=\""$(SCALA_PKG_PROFILE)\""\ -DskipTests=true\ 
-Dcflags=\""$(CFLAGS)\""\ -Dcxx=\""$(CXX)\""\ -Dldflags=\""$(LDFLAGS)\""\ -Dlddeps=\""$(LIB_DEP) $(ROOTDIR)/lib/libmxnet.a\"""") scaladeploy: (cd $(ROOTDIR)/scala-package; \ - mvn deploy -Papache-release,$(SCALA_PKG_PROFILE),$(SCALA_VERSION_PROFILE) \-DskipTests -Dcxx="$(CXX)" \ + mvn deploy -Papache-release,$(SCALA_PKG_PROFILE),$(SCALA_VERSION_PROFILE) \-DskipTests=true -Dcxx="$(CXX)" \ -Dbuild.platform="$(SCALA_PKG_PROFILE)" \ -Dcflags="$(CFLAGS)" -Dldflags="$(LDFLAGS)" \ -Dlddeps="$(LIB_DEP) $(ROOTDIR)/lib/libmxnet.a") diff --git a/scala-package/core/pom.xml b/scala-package/core/pom.xml index 134e0a59d..16061979f 100644 --- a/scala-package/core/pom.xml +++ b/scala-package/core/pom.xml @@ -17,13 +17,13 @@ unittest - false + false integrationtest - true + true @@ -74,7 +74,7 @@ org.scalatest scalatest-maven-plugin - ${skiptest} + ${skipTests} -Djava.library.path=${project.parent.basedir}/native/${platform}/target \ -Dlog4j.configuration=file://${project.basedir}/src/test/resources/log4j.properties diff --git a/scala-package/examples/pom.xml b/scala-package/examples/pom.xml index 9a98f74e4..d24785b0e 100644 --- a/scala-package/examples/pom.xml +++ b/scala-package/examples/pom.xml @@ -17,13 +17,13 @@ unittest - true + true integrationtest - false + false @@ -134,7 +134,7 @@ org.scalatest scalatest-maven-plugin - ${skiptest} + ${skipTests} -Djava.library.path=${project.parent.basedir}/native/${platform}/target \ -Dlog4j.configuration=file://${project.basedir}/src/test/resources/log4j.properties diff --git a/scala-package/infer/pom.xml b/scala-package/infer/pom.xml index 3425579ac..71c85af34 100644 --- a/scala-package/infer/pom.xml +++ b/scala-package/infer/pom.xml @@ -17,13 +17,13 @@ unittest - false + false integrationtest - true + true @@ -74,7 +74,7 @@ org.scalatest scalatest-maven-plugin - ${skiptest} + ${skipTests} -Djava.library.path=${project.parent.basedir}/native/${platform}/target \ -Dlog4j.configuration=file://${project.basedir}/src/test/resources/log4j.properties diff --git a/scala-package/pom.xml b/scala-package/pom.xml index c4f162008..3511f4acf 100644 --- a/scala-package/pom.xml +++ b/scala-package/pom.xml @@ -228,6 +228,7 @@ scalatest-maven-plugin 1.0 + ${skipTests} ${project.build.directory}/surefire-reports . WDF TestSuite.txt From bd3fc88716d7312c93a2de9214c8682f19cb172b Mon Sep 17 00:00:00 2001 From: Anirudh Subramanian Date: Fri, 27 Jul 2018 17:50:58 -0700 Subject: [PATCH 20/63] Fix flaky test test_deconvolution (#11630) * Replace cublassgemm with cublassgemmex for >= 7.5 * Add comment for cublassgemmex --- src/operator/linalg_impl.h | 59 ++++++++++++++++++++++++++++---------- 1 file changed, 44 insertions(+), 15 deletions(-) diff --git a/src/operator/linalg_impl.h b/src/operator/linalg_impl.h index 08d2add28..c0ae97ad3 100644 --- a/src/operator/linalg_impl.h +++ b/src/operator/linalg_impl.h @@ -169,23 +169,52 @@ void linalg_gemm(const Tensor inline \ -void linalg_gemm(const Tensor& A, const Tensor& B, \ - const Tensor& C, DType alpha, DType beta, \ - bool tA, bool tB, Stream *s) { \ - using namespace mxnet; \ - using mshadow::gpu; \ - CHECK_NOTNULL(s); \ - check_gemm(A, B, C, alpha, beta, tA, tB); \ - CUBLAS_CALL(cublas##fname(Stream::GetBlasHandle(s), \ - (tB ? CUBLAS_OP_T : CUBLAS_OP_N), \ - (tA ? CUBLAS_OP_T : CUBLAS_OP_N), \ - C.size(1), C.size(0), (tB ? 
B.size(1) : B.size(0)), \ - &alpha, B.dptr_, B.stride_, A.dptr_, A.stride_, \ - &beta, C.dptr_, C.stride_)) \ +#define LINALG_GPU_GEMM(fname, DType) \ + template <> \ + inline void linalg_gemm( \ + const Tensor& A, const Tensor& B, \ + const Tensor& C, DType alpha, DType beta, bool tA, \ + bool tB, Stream* s) { \ + using namespace mxnet; \ + using mshadow::gpu; \ + CHECK_NOTNULL(s); \ + check_gemm(A, B, C, alpha, beta, tA, tB); \ + CUBLAS_CALL(cublas##fname( \ + Stream::GetBlasHandle(s), (tB ? CUBLAS_OP_T : CUBLAS_OP_N), \ + (tA ? CUBLAS_OP_T : CUBLAS_OP_N), C.size(1), C.size(0), \ + (tB ? B.size(1) : B.size(0)), &alpha, B.dptr_, B.stride_, A.dptr_, \ + A.stride_, &beta, C.dptr_, C.stride_)) \ + } + +// Use cublasSgemmEx when it is available (CUDA >= 7.5). Resolves precision issues with +// cublasSgemm. Please see https://github.com/apache/incubator-mxnet/pull/11630 +#if CUDA_VERSION >= 7050 +template <> +inline void linalg_gemm(const Tensor& A, + const Tensor& B, + const Tensor& C, float alpha, + float beta, bool tA, bool tB, + Stream* s) { + using namespace mxnet; + using mshadow::gpu; + CHECK_NOTNULL(s); + check_gemm(A, B, C, alpha, beta, tA, tB); +#if CUDA_VERSION >= 8000 + cudaDataType_t full_datatype = CUDA_R_32F; +#else + cublasDataType_t full_datatype = CUBLAS_DATA_FULL; +#endif + CUBLAS_CALL(cublasSgemmEx( + Stream::GetBlasHandle(s), (tB ? CUBLAS_OP_T : CUBLAS_OP_N), + (tA ? CUBLAS_OP_T : CUBLAS_OP_N), C.size(1), C.size(0), + (tB ? B.size(1) : B.size(0)), &alpha, B.dptr_, full_datatype, B.stride_, + A.dptr_, full_datatype, A.stride_, &beta, C.dptr_, full_datatype, + C.stride_)) } + +#else LINALG_GPU_GEMM(Sgemm, float) +#endif LINALG_GPU_GEMM(Dgemm, double) // Version where matrix rows are given by first axis. From 011a0dc7c1c47daeb39424999c33827aabf7ce95 Mon Sep 17 00:00:00 2001 From: Kalyanee Chendke Date: Sat, 28 Jul 2018 11:00:35 -0700 Subject: [PATCH 21/63] Remove fixed seed for test_sparse_nd_save_load (#11920) * Remove fixed seed for test_sparse_nd_save_load * Add comments related to the commit --- tests/python/unittest/test_sparse_ndarray.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/python/unittest/test_sparse_ndarray.py b/tests/python/unittest/test_sparse_ndarray.py index 975f576bc..508f52301 100644 --- a/tests/python/unittest/test_sparse_ndarray.py +++ b/tests/python/unittest/test_sparse_ndarray.py @@ -534,7 +534,9 @@ def test_sparse_nd_pickle(): assert same(a.asnumpy(), b.asnumpy()) -@with_seed(0) +# @kalyc: Getting rid of fixed seed as flakiness could not be reproduced +# tracked at https://github.com/apache/incubator-mxnet/issues/11741 +@with_seed() def test_sparse_nd_save_load(): repeat = 1 stypes = ['default', 'row_sparse', 'csr'] From 196468dedf3135252ed1a98a3db627ffef019473 Mon Sep 17 00:00:00 2001 From: Thom Lane Date: Sat, 28 Jul 2018 17:24:42 -0700 Subject: [PATCH 22/63] Corrections to profiling tutorial (#11887) Corrected a race condition with stopping profiling. Added mx.nd.waitall to ensure all operations have completed, including GPU operations that might otherwise be missing. Also added alternative code for context selection GPU vs CPU, that had error before on machines with nvidia-smi. 
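Because the MXNet engine executes operations asynchronously, `set_state('stop')` can fire while GPU kernels are still in flight, which is the race the change below removes by draining the engine first. A minimal sketch of the corrected stop sequence, assuming the standard `mxnet.profiler` API:

```python
import mxnet as mx
from mxnet import profiler

profiler.set_config(profile_all=True, filename='profile_output.json')
profiler.set_state('run')

x = mx.nd.random.uniform(shape=(1024, 1024))
y = mx.nd.dot(x, x)          # enqueued asynchronously on the engine

mx.nd.waitall()              # block until all pending operations complete
profiler.set_state('stop')   # the profile now includes every operation
```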
--- docs/tutorials/python/profiler.md | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/docs/tutorials/python/profiler.md b/docs/tutorials/python/profiler.md index 81ecc2a84..d99bb19ee 100644 --- a/docs/tutorials/python/profiler.md +++ b/docs/tutorials/python/profiler.md @@ -94,7 +94,10 @@ Let's define a method that will run one training iteration given data and label. ```python # Use GPU if available -ctx = mx.gpu() if mx.test_utils.list_gpus() else mx.cpu() +try: + mx.test_utils.list_gpus(); ctx = mx.gpu() +except: + ctx = mx.cpu() # Initialize the parameters with random weights net.collect_params().initialize(mx.init.Xavier(), ctx=ctx) @@ -144,7 +147,8 @@ profiler.set_state('run') run_training_iteration(*next(itr)) -# Ask the profiler to stop recording +# Ask the profiler to stop recording after operations have completed +mx.nd.waitall() profiler.set_state('stop') ``` From 54ebc5db7967fe023d987796dd098ecf8a65e507 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Sat, 28 Jul 2018 17:26:10 -0700 Subject: [PATCH 23/63] Fix image classification scripts and Improve Fp16 tutorial (#11533) * fix bugs and improve tutorial * improve logging * update benchmark_score * Update float16.md * update link to dmlc web data * fix train cifar and add random mirroring * set aug defaults * fix whitespace * fix typo --- docs/faq/float16.md | 12 +++++-- example/gluon/data.py | 15 ++++----- .../image-classification/benchmark_score.py | 9 ++++-- example/image-classification/common/data.py | 31 +++++++++++++------ example/image-classification/fine-tune.py | 4 +-- example/image-classification/train_cifar10.py | 8 ++++- .../image-classification/train_imagenet.py | 12 +++++-- src/io/image_aug_default.cc | 2 +- 8 files changed, 66 insertions(+), 27 deletions(-) diff --git a/docs/faq/float16.md b/docs/faq/float16.md index cbb308f69..b4cd97b30 100644 --- a/docs/faq/float16.md +++ b/docs/faq/float16.md @@ -102,9 +102,17 @@ python fine-tune.py --network resnet --num-layers 50 --pretrained-model imagenet ``` ## Example training results -Here is a plot to compare the training curves of a Resnet50 v1 network on the Imagenet 2012 dataset. These training jobs ran for 95 epochs with a batch size of 1024 using a learning rate of 0.4 decayed by a factor of 1 at epochs 30,60,90 and used Gluon. The only changes made for the float16 job when compared to the float32 job were that the network and data were cast to float16, and the multi-precision mode was used for optimizer. The final accuracies at 95th epoch were **76.598% for float16** and **76.486% for float32**. The difference is within what's normal random variation, and there is no reason to expect float16 to have better accuracy than float32 in general. This run was approximately **65% faster** to train with float16. +Let us consider training a Resnet50 v1 model on the Imagenet 2012 dataset. For this model, the GPU memory usage is close to the capacity of V100 GPU with a batch size of 128 when using float32. Using float16 allows the use of 256 batch size. Shared below are results using 8 V100 GPUs on a AWS p3.16x large instance. Let us compare the three scenarios that arise here: float32 with 1024 batch size, float16 with 1024 batch size and float16 with 2048 batch size. These jobs trained for 90 epochs using a learning rate of 0.4 for 1024 batch size and 0.8 for 2048 batch size. This learning rate was decayed by a factor of 0.1 at the 30th, 60th and 80th epochs. 
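The recipe behind these results — cast the network and the incoming batches to float16, and enable the optimizer's multi-precision mode so weight updates accumulate in float32 — is only a few lines in Gluon. A rough sketch (model choice and hyperparameters abbreviated; not the exact script behind the numbers that follow):

```python
import mxnet as mx
from mxnet import gluon
from mxnet.gluon.model_zoo import vision

ctx = mx.gpu()
net = vision.resnet50_v1()
net.initialize(mx.init.Xavier(), ctx=ctx)
net.cast('float16')                                  # parameters -> float16

trainer = gluon.Trainer(net.collect_params(), 'sgd',
                        {'learning_rate': 0.4, 'momentum': 0.9,
                         'multi_precision': True})   # fp32 master weights

batch = mx.nd.random.uniform(shape=(128, 3, 224, 224), ctx=ctx)
output = net(batch.astype('float16'))                # cast the data as well
```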
The only changes made for the float16 jobs when compared to the float32 job were that the network and data were cast to float16, and the multi-precision mode was used for optimizer. The final accuracy at 90th epoch and the time to train are tabulated below for these three scenarios. The top-1 validation errors at the end of each epoch are also plotted below. -![Training curves of Resnet50 v1 on Imagenet 2012](https://raw.githubusercontent.com/rahul003/web-data/03929a8beb8ac574f2392ed34cc6d4b2f052826a/mxnet/tutorials/mixed-precision/resnet50v1b_imagenet_fp16_fp32_training.png) +Batch size | Data type | Top 1 Validation accuracy | Time to train | Speedup | +--- | --- | --- | --- | --- | +1024 | float32 | 76.18% | 11.8 hrs | 1 | +1024 | float16 | 76.34% | 7.3 hrs | 1.62x | +2048 | float16 | 76.29% | 6.5 hrs | 1.82x | + +![Training curves of Resnet50 v1 on Imagenet 2012](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/tutorials/mixed-precision/resnet50v1b_imagenet_fp16_fp32_training.png) + +The differences in accuracies above are within normal random variation, and there is no reason to expect float16 to have better accuracy than float32 in general. As the plot indicates training behaves similarly for these cases, even though we didn't have to change any other hyperparameters. We can also see from the table that using float16 helps train faster through faster computation with float16 as well as allowing the use of larger batch sizes. ## Things to keep in mind diff --git a/example/gluon/data.py b/example/gluon/data.py index 6aa531648..f855c9050 100644 --- a/example/gluon/data.py +++ b/example/gluon/data.py @@ -21,6 +21,7 @@ import random import tarfile import logging +import tarfile logging.basicConfig(level=logging.INFO) import mxnet as mx @@ -92,9 +93,10 @@ def get_imagenet_iterator(root, batch_size, num_workers, data_shape=224, dtype=' def get_caltech101_data(): url = "https://s3.us-east-2.amazonaws.com/mxnet-public/101_ObjectCategories.tar.gz" dataset_name = "101_ObjectCategories" - if not os.path.isdir("data"): + data_folder = "data" + if not os.path.isdir(data_folder): os.makedirs(data_folder) - tar_path = mx.gluon.utils.download(url, path='data') + tar_path = mx.gluon.utils.download(url, path=data_folder) if (not os.path.isdir(os.path.join(data_folder, "101_ObjectCategories")) or not os.path.isdir(os.path.join(data_folder, "101_ObjectCategories_test"))): tar = tarfile.open(tar_path, "r:gz") @@ -110,18 +112,17 @@ def transform(image, label): # resize the shorter edge to 224, the longer edge will be greater or equal to 224 resized = mx.image.resize_short(image, 224) # center and crop an area of size (224,224) - cropped, crop_info = mx.image.center_crop(resized, 224) + cropped, crop_info = mx.image.center_crop(resized, (224, 224)) # transpose the channels to be (3,224,224) transposed = mx.nd.transpose(cropped, (2, 0, 1)) - image = mx.nd.cast(image, dtype) - return image, label + return transposed, label training_path, testing_path = get_caltech101_data() dataset_train = ImageFolderDataset(root=training_path, transform=transform) dataset_test = ImageFolderDataset(root=testing_path, transform=transform) - train_data = mx.gluon.data.DataLoader(dataset_train, batch_size, shuffle=True, num_workers=num_workers) - test_data = mx.gluon.data.DataLoader(dataset_test, batch_size, shuffle=False, num_workers=num_workers) + train_data = DataLoader(dataset_train, batch_size, shuffle=True, num_workers=num_workers) + test_data = DataLoader(dataset_test, batch_size, shuffle=False, 
num_workers=num_workers) return DataLoaderIter(train_data), DataLoaderIter(test_data) class DummyIter(mx.io.DataIter): diff --git a/example/image-classification/benchmark_score.py b/example/image-classification/benchmark_score.py index 0d47d859d..05e4b487f 100644 --- a/example/image-classification/benchmark_score.py +++ b/example/image-classification/benchmark_score.py @@ -79,13 +79,16 @@ def score(network, dev, batch_size, num_batches, dtype): logging.info('network: %s', net) for d in devs: logging.info('device: %s', d) + logged_fp16_warning = False for b in batch_sizes: for dtype in ['float32', 'float16']: if d == mx.cpu() and dtype == 'float16': #float16 is not supported on CPU continue - elif net in ['inception-bn', 'alexnet'] and dt == 'float16': - logging.info('{} does not support float16'.format(net)) + elif net in ['inception-bn', 'alexnet'] and dtype == 'float16': + if not logged_fp16_warning: + logging.info('Model definition for {} does not support float16'.format(net)) + logged_fp16_warning = True else: speed = score(network=net, dev=d, batch_size=b, num_batches=10, dtype=dtype) - logging.info('batch size %2d, dtype %s image/sec: %f', b, dtype, speed) + logging.info('batch size %2d, dtype %s, images/sec: %f', b, dtype, speed) diff --git a/example/image-classification/common/data.py b/example/image-classification/common/data.py index c1dfcf565..df1449b63 100755 --- a/example/image-classification/common/data.py +++ b/example/image-classification/common/data.py @@ -28,8 +28,12 @@ def add_data_args(parser): data.add_argument('--data-val-idx', type=str, default='', help='the index of validation data') data.add_argument('--rgb-mean', type=str, default='123.68,116.779,103.939', help='a tuple of size 3 for the mean rgb') + data.add_argument('--rgb-std', type=str, default='1,1,1', + help='a tuple of size 3 for the std rgb') data.add_argument('--pad-size', type=int, default=0, help='padding the input image') + data.add_argument('--fill-value', type=int, default=127, + help='Set the padding pixels value to fill_value') data.add_argument('--image-shape', type=str, help='the image shape feed into the network, e.g. (3,224,224)') data.add_argument('--num-classes', type=int, help='the number of classes') @@ -67,11 +71,18 @@ def add_data_aug_args(parser): aug.add_argument('--max-random-scale', type=float, default=1, help='max ratio to scale') aug.add_argument('--min-random-scale', type=float, default=1, - help='min ratio to scale, should >= img_size/input_shape. otherwise use --pad-size') + help='min ratio to scale, should >= img_size/input_shape. 
' + 'otherwise use --pad-size') aug.add_argument('--max-random-area', type=float, default=1, help='max area to crop in random resized crop, whose range is [0, 1]') aug.add_argument('--min-random-area', type=float, default=1, help='min area to crop in random resized crop, whose range is [0, 1]') + aug.add_argument('--min-crop-size', type=int, default=-1, + help='Crop both width and height into a random size in ' + '[min_crop_size, max_crop_size]') + aug.add_argument('--max-crop-size', type=int, default=-1, + help='Crop both width and height into a random size in ' + '[min_crop_size, max_crop_size]') aug.add_argument('--brightness', type=float, default=0, help='brightness jittering, whose range is [0, 1]') aug.add_argument('--contrast', type=float, default=0, @@ -84,13 +95,6 @@ def add_data_aug_args(parser): help='whether to use random resized crop') return aug -def set_resnet_aug(aug): - # standard data augmentation setting for resnet training - aug.set_defaults(random_crop=0, random_resized_crop=1) - aug.set_defaults(min_random_area=0.08) - aug.set_defaults(max_random_aspect_ratio=4./3., min_random_aspect_ratio=3./4.) - aug.set_defaults(brightness=0.4, contrast=0.4, saturation=0.4, pca_noise=0.1) - class SyntheticDataIter(DataIter): def __init__(self, num_classes, data_shape, max_iter, dtype): self.batch_size = data_shape[0] @@ -137,6 +141,7 @@ def get_rec_iter(args, kv=None): else: (rank, nworker) = (0, 1) rgb_mean = [float(i) for i in args.rgb_mean.split(',')] + rgb_std = [float(i) for i in args.rgb_std.split(',')] train = mx.io.ImageRecordIter( path_imgrec = args.data_train, path_imgidx = args.data_train_idx, @@ -144,6 +149,9 @@ def get_rec_iter(args, kv=None): mean_r = rgb_mean[0], mean_g = rgb_mean[1], mean_b = rgb_mean[2], + std_r = rgb_std[0], + std_g = rgb_std[1], + std_b = rgb_std[2], data_name = 'data', label_name = 'softmax_label', data_shape = image_shape, @@ -151,13 +159,15 @@ def get_rec_iter(args, kv=None): rand_crop = args.random_crop, max_random_scale = args.max_random_scale, pad = args.pad_size, - fill_value = 127, + fill_value = args.fill_value, random_resized_crop = args.random_resized_crop, min_random_scale = args.min_random_scale, max_aspect_ratio = args.max_random_aspect_ratio, min_aspect_ratio = args.min_random_aspect_ratio, max_random_area = args.max_random_area, min_random_area = args.min_random_area, + min_crop_size = args.min_crop_size, + max_crop_size = args.max_crop_size, brightness = args.brightness, contrast = args.contrast, saturation = args.saturation, @@ -181,6 +191,9 @@ def get_rec_iter(args, kv=None): mean_r = rgb_mean[0], mean_g = rgb_mean[1], mean_b = rgb_mean[2], + std_r = rgb_std[0], + std_g = rgb_std[1], + std_b = rgb_std[2], resize = 256, data_name = 'data', label_name = 'softmax_label', diff --git a/example/image-classification/fine-tune.py b/example/image-classification/fine-tune.py index 2a0c0ec99..719fa86e0 100644 --- a/example/image-classification/fine-tune.py +++ b/example/image-classification/fine-tune.py @@ -54,8 +54,8 @@ def get_fine_tune_model(symbol, arg_params, num_classes, layer_name, dtype='floa parser.add_argument('--layer-before-fullc', type=str, default='flatten0', help='the name of the layer before the last fullc layer')\ - # use less augmentations for fine-tune - data.set_data_aug_level(parser, 1) + # use less augmentations for fine-tune. 
by default here it uses no augmentations + # use a small learning rate and less regularizations parser.set_defaults(image_shape='3,224,224', num_epochs=30, diff --git a/example/image-classification/train_cifar10.py b/example/image-classification/train_cifar10.py index 7eb56ebce..f449aad68 100644 --- a/example/image-classification/train_cifar10.py +++ b/example/image-classification/train_cifar10.py @@ -31,6 +31,11 @@ def download_cifar10(): download_file('http://data.mxnet.io/data/cifar10/cifar10_train.rec', fnames[0]) return fnames +def set_cifar_aug(aug): + aug.set_defaults(rgb_mean='125.307,122.961,113.8575', rgb_std='51.5865,50.847,51.255') + aug.set_defaults(random_mirror=1, pad=4, fill_value=0, random_crop=1) + aug.set_defaults(min_random_size=32, max_random_size=32) + if __name__ == '__main__': # download data (train_fname, val_fname) = download_cifar10() @@ -41,7 +46,8 @@ def download_cifar10(): fit.add_fit_args(parser) data.add_data_args(parser) data.add_data_aug_args(parser) - data.set_data_aug_level(parser, 2) + # uncomment to set standard cifar augmentations + # set_cifar_aug(parser) parser.set_defaults( # network network = 'resnet', diff --git a/example/image-classification/train_imagenet.py b/example/image-classification/train_imagenet.py index a90b6aead..0835f5d3b 100644 --- a/example/image-classification/train_imagenet.py +++ b/example/image-classification/train_imagenet.py @@ -23,6 +23,14 @@ from common.util import download_file import mxnet as mx +def set_imagenet_aug(aug): + # standard data augmentation setting for imagenet training + aug.set_defaults(rgb_mean='123.68,116.779,103.939', rgb_std='58.393,57.12,57.375') + aug.set_defaults(random_crop=0, random_resized_crop=1, random_mirror=1) + aug.set_defaults(min_random_area=0.08) + aug.set_defaults(max_random_aspect_ratio=4./3., min_random_aspect_ratio=3./4.) + aug.set_defaults(brightness=0.4, contrast=0.4, saturation=0.4, pca_noise=0.1) + if __name__ == '__main__': # parse args parser = argparse.ArgumentParser(description="train imagenet-1k", @@ -30,8 +38,8 @@ fit.add_fit_args(parser) data.add_data_args(parser) data.add_data_aug_args(parser) - # uncomment to set standard augmentation for resnet training - # data.set_resnet_aug(parser) + # uncomment to set standard augmentations for imagenet training + # set_imagenet_aug(parser) parser.set_defaults( # network network = 'resnet', diff --git a/src/io/image_aug_default.cc b/src/io/image_aug_default.cc index 5b28aa189..bea2e2c07 100644 --- a/src/io/image_aug_default.cc +++ b/src/io/image_aug_default.cc @@ -178,7 +178,7 @@ struct DefaultImageAugmentParam : public dmlc::Parameter Date: Sun, 29 Jul 2018 12:39:42 -0700 Subject: [PATCH 24/63] [MXNET-711] Website build and version dropdown update (#11892) * adding param for list of tags to display on website * using new website display argument for artifact placement in version folder * adding display logic * remove restricted setting for testing * update usage instructions * reverted Jenkinsfile to use restricted nodes --- ci/docker/runtime_functions.sh | 6 ++- docs/build_version_doc/build_all_version.sh | 48 ++++++++++++++++---- docs/build_version_doc/update_all_version.sh | 6 +-- 3 files changed, 46 insertions(+), 14 deletions(-) diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh index a0795eb58..8805850e3 100755 --- a/ci/docker/runtime_functions.sh +++ b/ci/docker/runtime_functions.sh @@ -814,7 +814,11 @@ build_docs() { set -ex pushd . 
cd /work/mxnet/docs/build_version_doc - ./build_all_version.sh $1 + # Parameters are set in the Jenkins pipeline: restricted-website-build + # $1 is the list of branches to build; $2 is the list of tags to display + # So you can build from the 1.2.0 branch, but display 1.2.1 on the site + ./build_all_version.sh $1 $2 + # $3 is the default version tag for the website; $4 is the base URL ./update_all_version.sh $2 $3 $4 cd VersionedWeb tar -zcvf ../artifacts.tgz . diff --git a/docs/build_version_doc/build_all_version.sh b/docs/build_version_doc/build_all_version.sh index 56b80e3a0..44cd540fd 100755 --- a/docs/build_version_doc/build_all_version.sh +++ b/docs/build_version_doc/build_all_version.sh @@ -20,23 +20,42 @@ # This script is for locally building website for all versions # Built files are stored in $built -# Takes one argument: -# * tag list - space delimited list of Github tags; Example: "1.1.0 1.0.0 master" +# Takes two arguments: +# tag list - semicolon delimited list of Github tags +# Example: "1.2.0;1.1.0;master" +# display list - semicolon delimited list of what to display on website +# Example: "1.2.1;1.1.0;master" +# The number of tags for the two arguments must be the same. # Example Usage: -# ./build_all_version.sh "1.1.0 1.0.0 master" +# ./build_all_version.sh "1.2.0;1.1.0;master" "1.2.1;1.1.0;master" +# ./build_all_version.sh "1.2.0" "1.2.1" set -e set -x if [ -z "$1" ] then - echo "Please provide a list of version tags you wish to run." + echo "Please provide a list of branches or tags you wish to build." exit 1 else IFS=$';' tag_list=$1 echo "Using these tags: $tag_list" - for tag in $tag_list; do echo $tag; done + build_arr=($tag_list) +fi + +if [ -z "$2" ] + then + echo "Please provide a list of version tags you wish to display on the site." + exit 1 + else + IFS=$';' + tags_to_display=$2 + echo "Displaying these tags: $tags_to_display" + display_arr=($tags_to_display) + for key in ${!build_arr[@]}; do + echo "Branch/tag ${build_arr[${key}]} will be displayed as ${display_arr[${key}]}" + done fi mxnet_url="https://github.com/apache/incubator-mxnet.git" @@ -51,18 +70,27 @@ fi if [ ! -d "$built" ]; then mkdir $built mkdir "$built/versions" + else + if [ ! -d "$built/versions" ]; then + mkdir "$built/versions" + fi fi -# Build all versions and use latest version(First version number in $tag_list) as landing page. -for tag in $tag_list; do +# Checkout each tag and build it +# Then store it in a folder according to the desired display tag +for key in ${!build_arr[@]}; do + tag=${build_arr[${key}]} cd "$mxnet_folder" git fetch if [ $tag == 'master' ] then git checkout master git pull + echo "Building master..." else - git checkout "v$tag" + # Use "v$tag" for branches or pass that in from jenkins + git checkout "$tag" + echo "Building $tag..." fi git submodule update --init --recursive || exit 1 @@ -72,11 +100,13 @@ for tag in $tag_list; do make clean make html USE_OPENMP=1 || exit 1 cd ../../ - file_loc="$built/versions/$tag" + # Use the display tag name for the folder name + file_loc="$built/versions/${display_arr[${key}]}" if [ -d "$file_loc" ] ; then rm -rf "$file_loc" fi mkdir "$file_loc" + echo "Storing artifacts for $tag in $file_loc folder..." cp -a "$mxnet_folder/docs/_build/html/." 
"$file_loc" done diff --git a/docs/build_version_doc/update_all_version.sh b/docs/build_version_doc/update_all_version.sh index bfd656f5a..e39b0a503 100755 --- a/docs/build_version_doc/update_all_version.sh +++ b/docs/build_version_doc/update_all_version.sh @@ -23,12 +23,12 @@ # the tags you want to update. # Takes three arguments: -# * tag list - space delimited list of Github tags; Example: "1.1.0 1.0.0 master" +# * tag list - semicolon delimited list of tags to display on site; Example: "1.1.0;1.0.0;master" # * default tag - which version should the site default to; Example: 1.0.0 # * root URL - for the versions dropdown to change to production or dev server; Example: http://mxnet.incubator.apache.org/ # Example Usage: -# ./update_all_version.sh "1.1.0 1.0.0 master" 1.0.0 http://mxnet.incubator.apache.org/ +# ./update_all_version.sh "1.1.0;1.0.0;master" 1.0.0 http://mxnet.incubator.apache.org/ set -e set -x @@ -36,7 +36,6 @@ set -x MASTER_SOURCE_DIR="../../docs" STATIC_FILES_DIR="_static" MXNET_THEME_DIR="_static/mxnet-theme" -BUILD_HTML_DIR="_build/html" if [ -z "$1" ] then @@ -132,4 +131,3 @@ for tag in $tag_list; do done echo "The output of this process can be found in the VersionedWeb folder." - From 6372037ec4eb56b609a184712e5ec993a21faad0 Mon Sep 17 00:00:00 2001 From: vishaalkapoor <40836875+vishaalkapoor@users.noreply.github.com> Date: Sun, 29 Jul 2018 12:40:32 -0700 Subject: [PATCH 25/63] [MXAPPS-581] Fixes for broken Straight Dope tests. (#11923) * Update relative paths pointing to the data directory to point to the correct place in the testing temporary folder. * Enable the notebooks that were previously broken because of relative file paths not pointing to the correct place. * Move some notebooks we do not plan to test to the whitelist. These notebooks are not published in the Straight Dope book. * Clean-up: Convert print statements to info/warn/error logging statements. Add some logging statements for better status. --- .../straight_dope/straight_dope_test_utils.py | 39 ++++++-- .../test_notebooks_single_gpu.py | 94 +++++-------------- tests/utils/notebook_test/__init__.py | 7 +- 3 files changed, 61 insertions(+), 79 deletions(-) diff --git a/tests/nightly/straight_dope/straight_dope_test_utils.py b/tests/nightly/straight_dope/straight_dope_test_utils.py index bb64f37fe..ee499a56f 100644 --- a/tests/nightly/straight_dope/straight_dope_test_utils.py +++ b/tests/nightly/straight_dope/straight_dope_test_utils.py @@ -24,6 +24,7 @@ the notebook. e.g: `export MXNET_TEST_KERNEL=python2` """ import io +import logging import os import re import shutil @@ -40,6 +41,7 @@ GIT_REPO = 'https://github.com/zackchase/mxnet-the-straight-dope' KERNEL = os.getenv('MXNET_TEST_KERNEL', None) NOTEBOOKS_DIR = os.path.join(os.path.dirname(__file__), 'tmp_notebook') +RELATIVE_DATA_PATH_REGEX = r'\.\.\/data\/' # Regular expression to match the relative data path. def _test_notebook(notebook, override_epochs=True): """Run Jupyter notebook to catch any execution error. @@ -47,13 +49,18 @@ def _test_notebook(notebook, override_epochs=True): Args: notebook : string notebook name in folder/notebook format - epochs : boolean + override_epochs : boolean whether or not to override the number of epochs to 1 + Returns: True if the notebook runs without warning or error. """ + # Some notebooks will fail to run without error if we do not override the data path. 
+ _override_data_path(notebook) + if override_epochs: _override_epochs(notebook) + return run_notebook(notebook, NOTEBOOKS_DIR, kernel=KERNEL, temp_dir=NOTEBOOKS_DIR) @@ -63,15 +70,14 @@ def _override_epochs(notebook): Args: notebook : string notebook name in folder/notebook format - """ notebook_path = os.path.join(*([NOTEBOOKS_DIR] + notebook.split('/'))) + ".ipynb" - # Read the notebook and set epochs to num_epochs + # Read the notebook and set epochs to num_epochs. with io.open(notebook_path, 'r', encoding='utf-8') as f: notebook = f.read() - # Set number of epochs to 1 + # Set number of epochs to 1. modified_notebook = re.sub(EPOCHS_REGEX, 'epochs = 1', notebook) # Replace the original notebook with the modified one. @@ -79,13 +85,34 @@ def _override_epochs(notebook): f.write(modified_notebook) +def _override_data_path(notebook): + """Overrides the relative path for the data directory to point to the right place. This is + required as we run the notebooks in a different directory hierarchy more suitable for testing. + + Args: + notebook : string + notebook name in folder/notebook format + """ + notebook_path = os.path.join(*([NOTEBOOKS_DIR] + notebook.split('/'))) + ".ipynb" + + # Read the notebook and set epochs to num_epochs. + with io.open(notebook_path, 'r', encoding='utf-8') as f: + notebook = f.read() + + # Update the location for the data directory. + modified_notebook = re.sub(RELATIVE_DATA_PATH_REGEX, NOTEBOOKS_DIR + '/data/', notebook) + + # Replace the original notebook with the modified one. + with io.open(notebook_path, 'w', encoding='utf-8') as f: + f.write(modified_notebook) + def _download_straight_dope_notebooks(): """Downloads the Straight Dope Notebooks. Returns: True if it succeeds in downloading the notebooks without error. 
""" - print('Cleaning and setting up notebooks directory "{}"'.format(NOTEBOOKS_DIR)) + logging.info('Cleaning and setting up notebooks directory "{}"'.format(NOTEBOOKS_DIR)) shutil.rmtree(NOTEBOOKS_DIR, ignore_errors=True) cmd = [GIT_PATH, @@ -98,7 +125,7 @@ def _download_straight_dope_notebooks(): if proc.returncode != 0: err_msg = 'Error downloading Straight Dope notebooks.\n' err_msg += msg - print(err_msg) + logging.error(err_msg) return False return True diff --git a/tests/nightly/straight_dope/test_notebooks_single_gpu.py b/tests/nightly/straight_dope/test_notebooks_single_gpu.py index b87d16cb0..ee7c94c80 100644 --- a/tests/nightly/straight_dope/test_notebooks_single_gpu.py +++ b/tests/nightly/straight_dope/test_notebooks_single_gpu.py @@ -32,9 +32,19 @@ 'chapter01_crashcourse/introduction', 'chapter01_crashcourse/chapter-one-problem-set', 'chapter02_supervised-learning/environment', + 'chapter03_deep-neural-networks/kaggle-gluon-kfold', 'chapter07_distributed-learning/multiple-gpus-scratch', 'chapter07_distributed-learning/multiple-gpus-gluon', - 'chapter07_distributed-learning/training-with-multiple-machines' + 'chapter07_distributed-learning/training-with-multiple-machines', + 'chapter12_time-series/intro-forecasting-gluon', + 'chapter12_time-series/intro-forecasting-2-gluon', + 'chapter13_unsupervised-learning/vae-gluon', + 'chapter18_variational-methods-and-uncertainty/bayes-by-backprop-rnn', + 'chapter17_deep-reinforcement-learning/DQN', + 'chapter17_deep-reinforcement-learning/DDQN', + 'chapter19_graph-neural-networks/Graph-Neural-Networks', + 'chapter16_tensor_methods/tensor_basics', + 'cheatsheets/kaggle-gluon-kfold' ] @@ -91,10 +101,8 @@ def test_linear_regression_scratch(self): def test_linear_regression_gluon(self): assert _test_notebook('chapter02_supervised-learning/linear-regression-gluon') - # TODO(vishaalk): There is a relative file path needs to be fixed so that the - # python code can be run from another directory. - #def test_logistic_regression_gluon(self): - # assert _test_notebook('chapter02_supervised-learning/logistic-regression-gluon') + def test_logistic_regression_gluon(self): + assert _test_notebook('chapter02_supervised-learning/logistic-regression-gluon') def test_softmax_regression_scratch(self): assert _test_notebook('chapter02_supervised-learning/softmax-regression-scratch') @@ -132,9 +140,6 @@ def test_plumbing(self): def test_custom_layer(self): assert _test_notebook('chapter03_deep-neural-networks/custom-layer') - #def test_kaggle_gluon_kfold(self): - # assert _test_notebook('chapter03_deep-neural-networks/kaggle-gluon-kfold') - # TODO(vishaalk): Load params and Save params are deprecated warning. #def test_serialization(self): # assert _test_notebook('chapter03_deep-neural-networks/serialization') @@ -162,20 +167,14 @@ def test_cnn_batch_norm_gluon(self): # Chapter 5 - # TODO(vishaalk): There is a relative file path needs to be fixed so that the - # python code can be run from another directory. - #def test_simple_rnn(self): - # assert _test_notebook('chapter05_recurrent-neural-networks/simple-rnn') + def test_simple_rnn(self): + assert _test_notebook('chapter05_recurrent-neural-networks/simple-rnn') - # TODO(vishaalk): There is a relative file path needs to be fixed so that the - # python code can be run from another directory. 
- #def test_lstm_scratch(self): - # assert _test_notebook('chapter05_recurrent-neural-networks/lstm-scratch') + def test_lstm_scratch(self): + assert _test_notebook('chapter05_recurrent-neural-networks/lstm-scratch') - # TODO(vishaalk): There is a relative file path needs to be fixed so that the - # python code can be run from another directory. - #def test_gru_scratch(self): - # assert _test_notebook('chapter05_recurrent-neural-networks/gru-scratch') + def test_gru_scratch(self): + assert _test_notebook('chapter05_recurrent-neural-networks/gru-scratch') #def test_rnns_gluon(self): # assert _test_notebook('chapter05_recurrent-neural-networks/rnns-gluon') @@ -263,19 +262,6 @@ def test_lds_scratch(self): #def test_issm_scratch(self): # assert _test_notebook('chapter12_time-series/issm-scratch') - # TODO(vishaalk): Error: sequential1_batchnorm0_running_mean' has not been initialized - # def test_intro_forecasting_gluon(self): - # assert _test_notebook('chapter12_time-series/intro-forecasting-gluon') - - #def test_intro_forecasting_2_gluon(self): - # assert _test_notebook('chapter12_time-series/intro-forecasting-2-gluon') - - # Chapter 13 - - # TODO(vishaalk): Load params and Save params are deprecated warning. - #def test_vae_gluon(self): - # assert _test_notebook('chapter13_unsupervised-learning/vae-gluon') - # Chapter 14 def test_igan_intro(self): @@ -287,46 +273,14 @@ def test_dcgan(self): def test_generative_adversarial_networks(self): assert _test_notebook('chapter14_generative-adversarial-networks/conditional') - # Chapter 16 - - # TODO(vishaalk): Checked failed oshape.Size() != dshape.Size() - #def test_tensor_basics(self): - # assert _test_notebook('chapter16_tensor_methods/tensor_basics') - # TODO(vishaalk): Notebook does not appear to be valid JSON. #def test_pixel2pixel(self): # assert _test_notebook('chapter14_generative-adversarial-networks/pixel2pixel') - # Chapter 17 - - # TODO(vishaalk): Requires OpenAI Gym. Also uses deprecated load_params. - #def test_dqn(self): -# assert _test_notebook('chapter17_deep-reinforcement-learning/DQN') - -#def test_ddqn(self): -# assert _test_notebook('chapter17_deep-reinforcement-learning/DDQN') - -# Chapter 18 - -#def test_bayes_by_backprop(self): -# assert _test_notebook('chapter18_variational-methods-and-uncertainty/bayes-by-backprop') - -#def test_bayes_by_backprop_gluon(self): -# assert _test_notebook('chapter18_variational-methods-and-uncertainty/bayes-by-backprop-gluon') - -# TODO(vishaalk): AttributeError: 'list' object has no attribute 'keys' -#def test_bayes_by_backprop_rnn(self): -# assert _test_notebook('chapter18_variational-methods-and-uncertainty/bayes-by-backprop-rnn') - -# Chapter 19 - -# TODO(vishaalk): Requires deepchem -#def test_graph_neural_networks(self): -# assert _test_notebook('chapter19_graph-neural-networks/Graph-Neural-Networks') + # Chapter 18 -# Cheatsheets + #def test_bayes_by_backprop(self): + # assert _test_notebook('chapter18_variational-methods-and-uncertainty/bayes-by-backprop') -# TODO(vishaalk): There is a relative file path needs to be fixed so that the -# python code can be run from another directory. 
-#def test_kaggle_gluon_kfold(self): -# assert _test_notebook('cheatsheets/kaggle-gluon-kfold') + #def test_bayes_by_backprop_gluon(self): + # assert _test_notebook('chapter18_variational-methods-and-uncertainty/bayes-by-backprop-gluon') diff --git a/tests/utils/notebook_test/__init__.py b/tests/utils/notebook_test/__init__.py index cb5282fb4..2cdb6134a 100644 --- a/tests/utils/notebook_test/__init__.py +++ b/tests/utils/notebook_test/__init__.py @@ -21,6 +21,7 @@ warning or exception. """ import io +import logging import os import shutil import time @@ -57,12 +58,12 @@ def run_notebook(notebook, notebook_dir, kernel=None, no_cache=False, temp_dir=' ------- Returns true if the workbook runs with no warning or exception. """ - + logging.info("Running notebook '{}'".format(notebook)) notebook_path = os.path.join(*([notebook_dir] + notebook.split('/'))) working_dir = os.path.join(*([temp_dir] + notebook.split('/'))) if no_cache == '1': - print("Cleaning and setting up temp directory '{}'".format(working_dir)) + logging.info("Cleaning and setting up temp directory '{}'".format(working_dir)) shutil.rmtree(temp_dir, ignore_errors=True) errors = [] @@ -92,6 +93,6 @@ def run_notebook(notebook, notebook_dir, kernel=None, no_cache=False, temp_dir=' if "Warning:" in line: errors.append("Warning:\n" + line) if len(errors) > 0: - print('\n'.join(errors)) + logging.error('\n'.join(errors)) return False return True From 83ae3a3cd76ae9e314104f61b618eb046a3015fb Mon Sep 17 00:00:00 2001 From: Pedro Larroy <928489+larroy@users.noreply.github.com> Date: Mon, 30 Jul 2018 19:11:57 +0200 Subject: [PATCH 26/63] Disable flaky test: test_spatial_transformer_with_type (#11930) https://github.com/apache/incubator-mxnet/issues/11839 --- tests/python/gpu/test_operator_gpu.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/python/gpu/test_operator_gpu.py b/tests/python/gpu/test_operator_gpu.py index 99d8d0980..8877b5739 100644 --- a/tests/python/gpu/test_operator_gpu.py +++ b/tests/python/gpu/test_operator_gpu.py @@ -693,6 +693,7 @@ def test_grid_generator_with_type(): check_consistency(sym, ctx_list, grad_req="add") +@unittest.skip("test fails intermittently. temporarily disabled till it gets fixed. https://github.com/apache/incubator-mxnet/issues/11839") @with_seed() def test_spatial_transformer_with_type(): data = mx.sym.Variable('data') From 2cc5a422247b338b5382860c83d48a1221841bb7 Mon Sep 17 00:00:00 2001 From: Xinyu Chen Date: Tue, 31 Jul 2018 02:32:01 +0800 Subject: [PATCH 27/63] Add linux and macos MKLDNN Building Instruction (#11049) * add linux and macos doc * update doc * Update MKL_README.md * Update MKL_README.md Add convolution code to verify mkldnn backend * add homebrew link * rename to MKLDNN_README * add mkl verify * trigger * trigger * set mac complier to gcc47 * add VS2017 support experimentally * improve quality * improve quality * modify mac build instruction since prepare_mkldnn.sh has been rm * trigger * add some improvement --- MKLDNN_README.md | 301 +++++++++++++++++++++++++++++++++++++++++++++++ MKL_README.md | 77 ------------ 2 files changed, 301 insertions(+), 77 deletions(-) create mode 100644 MKLDNN_README.md delete mode 100644 MKL_README.md diff --git a/MKLDNN_README.md b/MKLDNN_README.md new file mode 100644 index 000000000..43cced49e --- /dev/null +++ b/MKLDNN_README.md @@ -0,0 +1,301 @@ +# Build/Install MXNet with MKL-DNN + +Building MXNet with [Intel MKL-DNN](https://github.com/intel/mkl-dnn) will gain better performance when using Intel Xeon CPUs for training and inference. 
The performance improvement can be seen on this [page](https://mxnet.incubator.apache.org/faq/perf.html#intel-cpu). Below are instructions for the Linux, MacOS and Windows platforms. + +

<h2 id="0">Contents</h2>

+ +* [1. Linux](#1) +* [2. MacOS](#2) +* [3. Windows](#3) +* [4. Verify MXNet with python](#4) +* [5. Enable MKL BLAS](#5) +* [6. Support](#6) + +

<h2 id="1">Linux</h2>

+ +### Prerequisites + +``` +sudo apt-get update +sudo apt-get install -y build-essential git +sudo apt-get install -y libopenblas-dev liblapack-dev +sudo apt-get install -y libopencv-dev +sudo apt-get install -y graphviz +``` + +### Clone MXNet sources + +``` +git clone --recursive https://github.com/apache/incubator-mxnet.git +cd incubator-mxnet +``` + +### Build MXNet with MKL-DNN + +``` +make -j $(nproc) USE_OPENCV=1 USE_MKLDNN=1 USE_BLAS=mkl USE_INTEL_PATH=/opt/intel +``` + +If you don't have the full [MKL](https://software.intel.com/en-us/intel-mkl) library installed, you can use OpenBLAS instead by setting `USE_BLAS=openblas`. + +

<h2 id="2">MacOS</h2>

+ +### Prerequisites + +Install the dependencies required for MXNet with the following commands: + +- [Homebrew](https://brew.sh/) +- gcc (clang in macOS does not support OpenMP) +- OpenCV (for computer vision operations) + +``` +# Paste this command in Mac terminal to install Homebrew +/usr/bin/ruby -e "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install)" + +# install dependency +brew update +brew install pkg-config +brew install graphviz +brew tap homebrew/core +brew install opencv +brew tap homebrew/versions +brew install gcc49 +brew link gcc49 #gcc-5 and gcc-7 also work +``` + +### Clone MXNet sources + +``` +git clone --recursive https://github.com/apache/incubator-mxnet.git +cd incubator-mxnet +``` + +### Enable OpenMP for MacOS + +If you want to enable OpenMP for better performance, you should modify the Makefile in the MXNet root directory: + +Add '-fopenmp' to CFLAGS for Darwin. + +``` +ifeq ($(USE_OPENMP), 1) +# ifneq ($(UNAME_S), Darwin) + CFLAGS += -fopenmp +# endif +endif +``` + +### Build MXNet with MKL-DNN + +``` +make -j $(sysctl -n hw.ncpu) CC=gcc-4.9 CXX=g++-4.9 USE_OPENCV=0 USE_OPENMP=1 USE_MKLDNN=1 USE_BLAS=apple USE_PROFILER=1 +``` + +*Note: Temporarily disable OPENCV.* + +

<h2 id="3">Windows</h2>

+ +We recommend building and installing MXNet yourself using [Microsoft Visual Studio 2015](https://www.visualstudio.com/vs/older-downloads/), or you can experiment with the latest [Microsoft Visual Studio 2017](https://www.visualstudio.com/downloads/). + +**Visual Studio 2015** + +To build and install MXNet yourself, you need the following dependencies. Install the required dependencies: + +1. If [Microsoft Visual Studio 2015](https://www.visualstudio.com/vs/older-downloads/) is not already installed, download and install it. You can download and install the free community edition. +2. Download and install [CMake 3](https://cmake.org/) if it is not already installed. +3. Download and install [OpenCV 3](http://sourceforge.net/projects/opencvlibrary/files/opencv-win/3.0.0/opencv-3.0.0.exe/download). +4. Unzip the OpenCV package. +5. Set the environment variable ```OpenCV_DIR``` to point to the ```OpenCV build directory``` (```C:\opencv\build\x64\vc14``` for example). Also, you need to add the OpenCV bin directory (```C:\opencv\build\x64\vc14\bin``` for example) to the ```PATH``` variable. +6. If you have the Intel Math Kernel Library (MKL) installed, set ```MKL_ROOT``` to point to the ```MKL``` directory that contains the ```include``` and ```lib``` directories. If you want to use MKL BLAS, set ```-DUSE_BLAS=mkl``` when running cmake. Typically, you can find the directory in +```C:\Program Files (x86)\IntelSWTools\compilers_and_libraries_2018\windows\mkl```. +7. If you don't have the Intel Math Kernel Library (MKL) installed, download and install [OpenBLAS](http://sourceforge.net/projects/openblas/files/v0.2.14/). Note that you should also download ```mingw64.dll.zip``` along with OpenBLAS and add them to the PATH. +8. Set the environment variable ```OpenBLAS_HOME``` to point to the ```OpenBLAS``` directory that contains the ```include``` and ```lib``` directories. Typically, you can find the directory in ```C:\Program files (x86)\OpenBLAS\```. + +After you have installed all of the required dependencies, build the MXNet source code: + +1. Download the MXNet source code from [GitHub](https://github.com/apache/incubator-mxnet). Don't forget to pull the submodules: +``` +git clone --recursive https://github.com/apache/incubator-mxnet.git +``` + +2. Copy the file `3rdparty/mkldnn/config_template.vcxproj` to the incubator-mxnet root. + +3. Start a Visual Studio command prompt. + +4. Use [CMake 3](https://cmake.org/) to create a Visual Studio solution in ```./build``` or some other directory. Make sure to specify the architecture in the +[CMake 3](https://cmake.org/) command: +``` +mkdir build +cd build +cmake -G "Visual Studio 14 Win64" .. -DUSE_CUDA=0 -DUSE_CUDNN=0 -DUSE_NVRTC=0 -DUSE_OPENCV=1 -DUSE_OPENMP=1 -DUSE_PROFILER=1 -DUSE_BLAS=open -DUSE_LAPACK=1 -DUSE_DIST_KVSTORE=0 -DCUDA_ARCH_NAME=All -DUSE_MKLDNN=1 -DCMAKE_BUILD_TYPE=Release +``` + +5. In Visual Studio, open the solution file, ```.sln```, and compile it. +These commands produce a library called ```libmxnet.dll``` in the ```./build/Release/``` or ```./build/Debug``` folder. +Also, ```libmkldnn.dll``` will be in ```./build/3rdparty/mkldnn/src/Release/```. + +6. Make sure that all the dll files used above (such as `libmkldnn.dll`, `libmklml.dll`, `libiomp5.dll`, `libopenblas.dll`, etc.) are added to the system PATH. For convenience, you can put all of them in ```\windows\system32```. Otherwise, you will come across `Not Found Dependencies` errors when loading mxnet.
+ +**Visual Studio 2017** + +To build and install MXNet yourself using [Microsoft Visual Studio 2017](https://www.visualstudio.com/downloads/), you need the following dependencies. Install the required dependencies: + +1. If [Microsoft Visual Studio 2017](https://www.visualstudio.com/downloads/) is not already installed, download and install it. You can download and install the free community edition. +2. Download and install [CMake 3](https://cmake.org/files/v3.11/cmake-3.11.0-rc4-win64-x64.msi) if it is not already installed. +3. Download and install [OpenCV](https://sourceforge.net/projects/opencvlibrary/files/opencv-win/3.4.1/opencv-3.4.1-vc14_vc15.exe/download). +4. Unzip the OpenCV package. +5. Set the environment variable ```OpenCV_DIR``` to point to the ```OpenCV build directory``` (e.g., ```OpenCV_DIR = C:\utils\opencv\build```). +6. If you don't have the Intel Math Kernel Library (MKL) installed, download and install [OpenBLAS](https://sourceforge.net/projects/openblas/files/v0.2.20/OpenBLAS%200.2.20%20version.zip/download). +7. Set the environment variable ```OpenBLAS_HOME``` to point to the ```OpenBLAS``` directory that contains the ```include``` and ```lib``` directories (e.g., ```OpenBLAS_HOME = C:\utils\OpenBLAS```). + +After you have installed all of the required dependencies, build the MXNet source code: + +1. Start ```cmd``` in Windows. + +2. Download the MXNet source code from GitHub by using the following command: + +```r +cd C:\ +git clone --recursive https://github.com/apache/incubator-mxnet.git +``` + +3. Copy the file `3rdparty/mkldnn/config_template.vcxproj` to the incubator-mxnet root. + +4. Follow [this link](https://docs.microsoft.com/en-us/visualstudio/install/modify-visual-studio) to modify ```Individual components```, check ```VC++ 2017 version 15.4 v14.11 toolset```, and click ```Modify```. + +5. Change the version of Visual Studio 2017 to v14.11 using the following command (by default, VS2017 is installed in the following path): + +```r +"C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary\Build\vcvars64.bat" -vcvars_ver=14.11 +``` + +6. Create a build directory using the following commands and go to the directory, for example: + +```r +mkdir C:\build +cd C:\build +``` + +7. CMake the MXNet source code by using the following command: + +```r +cmake -G "Visual Studio 15 2017 Win64" .. -T host=x64 -DUSE_CUDA=0 -DUSE_CUDNN=0 -DUSE_NVRTC=0 -DUSE_OPENCV=1 -DUSE_OPENMP=1 -DUSE_PROFILER=1 -DUSE_BLAS=open -DUSE_LAPACK=1 -DUSE_DIST_KVSTORE=0 -DCUDA_ARCH_NAME=All -DUSE_MKLDNN=1 -DCMAKE_BUILD_TYPE=Release +``` + +8. After CMake has completed successfully, compile the MXNet source code by using the following command: + +```r +msbuild mxnet.sln /p:Configuration=Release;Platform=x64 /maxcpucount +``` + +9. Make sure that all the dll files used above (such as `libmkldnn.dll`, `libmklml.dll`, `libiomp5.dll`, `libopenblas.dll`, etc.) are added to the system PATH. For convenience, you can put all of them in ```\windows\system32```. Otherwise, you will come across `Not Found Dependencies` errors when loading mxnet. + +

<h2 id="4">Verify MXNet with python</h2>

+ +``` +cd python +sudo python setup.py install +python -c "import mxnet as mx;print((mx.nd.ones((2, 3))*2).asnumpy());" + +Expected Output: + +[[ 2. 2. 2.] + [ 2. 2. 2.]] +``` + +### Verify whether MKL-DNN works + +After MXNet is installed, you can verify that the MKL-DNN backend works with a single convolution layer. + +``` +import mxnet as mx +import numpy as np + +num_filter = 32 +kernel = (3, 3) +pad = (1, 1) +shape = (32, 32, 256, 256) + +x = mx.sym.Variable('x') +w = mx.sym.Variable('w') +y = mx.sym.Convolution(data=x, weight=w, num_filter=num_filter, kernel=kernel, no_bias=True, pad=pad) +exe = y.simple_bind(mx.cpu(), x=shape) + +exe.arg_arrays[0][:] = np.random.normal(size=exe.arg_arrays[0].shape) +exe.arg_arrays[1][:] = np.random.normal(size=exe.arg_arrays[1].shape) + +exe.forward(is_train=False) +o = exe.outputs[0] +t = o.asnumpy() +``` + +You can turn on the `MKLDNN_VERBOSE` flag by setting an environment variable: +``` +export MKLDNN_VERBOSE=1 +``` +Then, by running the above code snippet, you should get output like the following, which means the `convolution` and `reorder` primitives from MKL-DNN were called. Layout information and primitive execution performance are also shown in the log messages. +``` +mkldnn_verbose,exec,reorder,jit:uni,undef,in:f32_nchw out:f32_nChw16c,num:1,32x32x256x256,6.47681 +mkldnn_verbose,exec,reorder,jit:uni,undef,in:f32_oihw out:f32_OIhw16i16o,num:1,32x32x3x3,0.0429688 +mkldnn_verbose,exec,convolution,jit:avx512_common,forward_inference,fsrc:nChw16c fwei:OIhw16i16o fbia:undef fdst:nChw16c,alg:convolution_direct,mb32_g1ic32oc32_ih256oh256kh3sh1dh0ph1_iw256ow256kw3sw1dw0pw1,9.98193 +mkldnn_verbose,exec,reorder,jit:uni,undef,in:f32_oihw out:f32_OIhw16i16o,num:1,32x32x3x3,0.0510254 +mkldnn_verbose,exec,reorder,jit:uni,undef,in:f32_nChw16c out:f32_nchw,num:1,32x32x256x256,20.4819 +``` + +
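The verbose flag can also be set from inside a Python script rather than the shell. The following is a minimal sketch, not part of the patch above; it assumes that setting the variable before `import mxnet` is early enough for the MKL-DNN library to pick it up, since the library may read the variable only once:

```
import os

# Assumption: the variable must be visible before the MKL-DNN library is
# loaded, so set it before importing mxnet in this process.
os.environ['MKLDNN_VERBOSE'] = '1'

import mxnet as mx

# Run any MKL-DNN-backed operator; mkldnn_verbose lines should appear on stdout.
x = mx.nd.random.normal(shape=(32, 32, 256, 256))
w = mx.nd.random.normal(shape=(32, 32, 3, 3))
y = mx.nd.Convolution(data=x, weight=w, num_filter=32, kernel=(3, 3),
                      pad=(1, 1), no_bias=True)
y.wait_to_read()  # block until the convolution has actually executed
```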

<h2 id="5">Enable MKL BLAS</h2>

+ +To make it convenient for customers, Intel introduced a new license called the [Intel® Simplified license](https://software.intel.com/en-us/license/intel-simplified-software-license) that allows redistributing not only dynamic libraries but also headers, examples and static libraries. + +Installing and enabling the full MKL library enables MKL support for all operators under the linalg namespace. + + 1. Download and install the latest full MKL version following the instructions on the [Intel website](https://software.intel.com/en-us/mkl). + + 2. Run `make -j $(nproc) USE_BLAS=mkl` + + 3. Navigate into the python directory + + 4. Run `sudo python setup.py install` + +### Verify whether MKL works + +After MXNet is installed, you can verify that MKL BLAS works with a single dot layer. + +``` +import mxnet as mx +import numpy as np + +shape_x = (1, 10, 8) +shape_w = (1, 12, 8) + +x_npy = np.random.normal(0, 1, shape_x) +w_npy = np.random.normal(0, 1, shape_w) + +x = mx.sym.Variable('x') +w = mx.sym.Variable('w') +y = mx.sym.batch_dot(x, w, transpose_b=True) +exe = y.simple_bind(mx.cpu(), x=x_npy.shape, w=w_npy.shape) + +exe.forward(is_train=False) +o = exe.outputs[0] +t = o.asnumpy() +``` + +You can turn on the `MKL_VERBOSE` flag by setting an environment variable: +``` +export MKL_VERBOSE=1 +``` +Then, by running the above code snippet, you should get output like the following, which means the `SGEMM` primitive from MKL was called. Layout information and primitive execution performance are also shown in the log messages. +``` +Numpy + Intel(R) MKL: THREADING LAYER: (null) +Numpy + Intel(R) MKL: setting Intel(R) MKL to use INTEL OpenMP runtime +Numpy + Intel(R) MKL: preloading libiomp5.so runtime +MKL_VERBOSE Intel(R) MKL 2018.0 Update 1 Product build 20171007 for Intel(R) 64 architecture Intel(R) Advanced Vector Extensions 512 (Intel(R) AVX-512) enabled processors, Lnx 2.40GHz lp64 intel_thread NMICDev:0 +MKL_VERBOSE SGEMM(T,N,12,10,8,0x7f7f927b1378,0x1bc2140,8,0x1ba8040,8,0x7f7f927b1380,0x7f7f7400a280,12) 8.93ms CNR:OFF Dyn:1 FastMM:1 TID:0 NThr:40 WDiv:HOST:+0.000 +``` + +
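To make the linalg remark above concrete, here is a small sketch (illustrative shapes, assuming a build with `USE_BLAS=mkl`) that exercises one of the operators under `mx.nd.linalg`; with `MKL_VERBOSE=1` exported as described, the corresponding GEMM call should show up in the log:

```
import mxnet as mx

# linalg.gemm2 computes a plain matrix product, C = alpha * op(A) * op(B);
# the linalg-namespace operators are the ones a full MKL install accelerates.
a = mx.nd.random.normal(shape=(64, 128))
b = mx.nd.random.normal(shape=(128, 32))
c = mx.nd.linalg.gemm2(a, b)

c.wait_to_read()  # force execution so any MKL_VERBOSE output is emitted
print(c.shape)    # (64, 32)
```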

<h2 id="6">Next Steps and Support</h2>

+ +- For questions or support specific to MKL, visit the [Intel MKL](https://software.intel.com/en-us/mkl) site. + +- For questions or support specific to MKL-DNN, visit the [Intel MKLDNN](https://github.com/intel/mkl-dnn) project. + +- If you find bugs, please open an issue on GitHub for [MXNet with MKL](https://github.com/apache/incubator-mxnet/labels/MKL) or [MXNet with MKLDNN](https://github.com/apache/incubator-mxnet/labels/MKLDNN). diff --git a/MKL_README.md b/MKL_README.md deleted file mode 100644 index a5c63b097..000000000 --- a/MKL_README.md +++ /dev/null @@ -1,77 +0,0 @@ -## Build/Install MXNet with a full MKL installation: - -To make it convenient for customers, Intel introduced a new license called [Intel® Simplified license](https://software.intel.com/en-us/license/intel-simplified-software-license) that allows to redistribute not only dynamic libraries but also headers, examples and static libraries. - -Installing and enabling the full MKL installation enables MKL support for all operators under the linalg namespace. - - 1. Download and install the latest full MKL version following instructions on the [intel website.](https://software.intel.com/en-us/mkl) - - 2. Run 'make -j ${nproc} USE_BLAS=mkl' - - 3. Navigate into the python directory - - 4. Run 'sudo python setup.py install' - - -## Build/Install MXNet with MKLDNN on Windows: - -To build and install MXNet yourself, you need the following dependencies. Install the required dependencies: - -1. If [Microsoft Visual Studio 2015](https://www.visualstudio.com/vs/older-downloads/) is not already installed, download and install it. You can download and install the free community edition. -2. Download and Install [CMake](https://cmake.org/) if it is not already installed. -3. Download and install [OpenCV](http://sourceforge.net/projects/opencvlibrary/files/opencv-win/3.0.0/opencv-3.0.0.exe/download). -4. Unzip the OpenCV package. -5. Set the environment variable ```OpenCV_DIR``` to point to the ```OpenCV build directory``` (```C:\opencv\build\x64\vc14``` for example). Also, you need to add the OpenCV bin directory (```C:\opencv\build\x64\vc14\bin``` for example) to the ``PATH`` variable. -6. If you have Intel Math Kernel Library (MKL) installed, set ```MKL_ROOT``` to point to ```MKL``` directory that contains the ```include``` and ```lib```. If you want to use MKL blas, you should set ```-DUSE_BLAS=mkl``` when cmake. Typically, you can find the directory in -```C:\Program Files (x86)\IntelSWTools\compilers_and_libraries_2018\windows\mkl```. -7. If you don't have the Intel Math Kernel Library (MKL) installed, download and install [OpenBLAS](http://sourceforge.net/projects/openblas/files/v0.2.14/). Note that you should also download ```mingw64.dll.zip`` along with openBLAS and add them to PATH. -8. Set the environment variable ```OpenBLAS_HOME``` to point to the ```OpenBLAS``` directory that contains the ```include``` and ```lib``` directories. Typically, you can find the directory in ```C:\Program files (x86)\OpenBLAS\```. - -After you have installed all of the required dependencies, build the MXNet source code: - -1. Download the MXNet source code from [GitHub](https://github.com/apache/incubator-mxnet). Don't forget to pull the submodules: -``` - git clone https://github.com/apache/incubator-mxnet.git --recursive -``` - -2. Copy file `3rdparty/mkldnn/config_template.vcxproj` to incubator-mxnet root. - -3. Start a Visual Studio command prompt. - -4. 
Use [CMake](https://cmake.org/) to create a Visual Studio solution in ```./build``` or some other directory. Make sure to specify the architecture in the -[CMake](https://cmake.org/) command: -``` - mkdir build - cd build - cmake -G "Visual Studio 14 Win64" .. -DUSE_CUDA=0 -DUSE_CUDNN=0 -DUSE_NVRTC=0 -DUSE_OPENCV=1 -DUSE_OPENMP=1 -DUSE_PROFILER=1 -DUSE_BLAS=open -DUSE_LAPACK=1 -DUSE_DIST_KVSTORE=0 -DCUDA_ARCH_NAME=All -DUSE_MKLDNN=1 -DCMAKE_BUILD_TYPE=Release -``` - -5. In Visual Studio, open the solution file,```.sln```, and compile it. -These commands produce a library called ```libmxnet.dll``` in the ```./build/Release/``` or ```./build/Debug``` folder. -Also ```libmkldnn.dll``` with be in the ```./build/3rdparty/mkldnn/src/Release/``` - -6. Make sure that all the dll files used above(such as `libmkldnn.dll`, `libmklml.dll`, `libiomp5.dll`, `libopenblas.dll`, etc) are added to the system PATH. For convinence, you can put all of them to ```\windows\system32```. Or you will come across `Not Found Dependencies` when loading mxnet. - -## Install MXNet for Python - -1. Install ```Python``` using windows installer available [here](https://www.python.org/downloads/release/python-2712/). -2. Install ```Numpy``` using windows installer available [here](http://scipy.org/install.html). -3. Next, we install Python package interface for MXNet. You can find the Python interface package for [MXNet on GitHub](https://github.com/dmlc/mxnet/tree/master/python/mxnet). - -```CMD - cd python - python setup.py install -``` -Done! We have installed MXNet with Python interface. Run below commands to verify our installation is successful. -```CMD - # Open Python terminal - python - - # You should be able to import mxnet library without any issues. - >>> import mxnet as mx; - >>> a = mx.nd.ones((2, 3)); - >>> print ((a*2).asnumpy()); - [[ 2. 2. 2.] - [ 2. 2. 2.]] -``` -We actually did a small tensor computation using MXNet! You are all set with MKLDNN MXNet on your Windows machine. 
From b2fd3b1cad34556c938feec385cdad1a19b1d606 Mon Sep 17 00:00:00 2001 From: Lanking Date: Mon, 30 Jul 2018 12:15:15 -0700 Subject: [PATCH 28/63] [MXNET-531] Add download util (#11866) * add changes to example * place the file to the util * add retry scheme * fix the retry logic * change the DownloadUtil to Util * Trigger the CI --- scala-package/core/pom.xml | 5 +++ .../scala/org/apache/mxnetexamples/Util.scala | 45 +++++++++++++++++++ .../multitask/ExampleMultiTask.scala | 11 ++--- .../CNNClassifierExampleSuite.scala | 22 +++------ .../customop/CustomOpExampleSuite.scala | 8 ++-- .../mxnetexamples/gan/GanExampleSuite.scala | 8 ++-- .../imclassification/MNISTExampleSuite.scala | 8 ++-- .../ImageClassifierExampleSuite.scala | 31 ++++--------- .../ObjectDetectorExampleSuite.scala | 30 ++++--------- 9 files changed, 87 insertions(+), 81 deletions(-) create mode 100644 scala-package/examples/src/main/scala/org/apache/mxnetexamples/Util.scala diff --git a/scala-package/core/pom.xml b/scala-package/core/pom.xml index 16061979f..c74b00fdc 100644 --- a/scala-package/core/pom.xml +++ b/scala-package/core/pom.xml @@ -104,5 +104,10 @@ 1.3.0-SNAPSHOT provided + + commons-io + commons-io + 2.1 + diff --git a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/Util.scala b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/Util.scala new file mode 100644 index 000000000..c1ff10c6c --- /dev/null +++ b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/Util.scala @@ -0,0 +1,45 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.mxnetexamples + +import java.io.File +import java.net.URL + +import org.apache.commons.io.FileUtils + +object Util { + + def downloadUrl(url: String, filePath: String, maxRetry: Option[Int] = None) : Unit = { + val tmpFile = new File(filePath) + var retry = maxRetry.getOrElse(3) + var success = false + if (!tmpFile.exists()) { + while (retry > 0 && !success) { + try { + FileUtils.copyURLToFile(new URL(url), tmpFile) + success = true + } catch { + case e: Exception => retry -= 1 + } + } + } else { + success = true + } + if (!success) throw new Exception(s"$url Download failed!") + } +} diff --git a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/multitask/ExampleMultiTask.scala b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/multitask/ExampleMultiTask.scala index 1270af3c4..9df2bcc05 100644 --- a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/multitask/ExampleMultiTask.scala +++ b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/multitask/ExampleMultiTask.scala @@ -24,9 +24,7 @@ import org.kohsuke.args4j.{CmdLineParser, Option} import org.slf4j.LoggerFactory import scala.collection.JavaConverters._ - import org.apache.commons.io.FileUtils - import org.apache.mxnet.Symbol import org.apache.mxnet.DataIter import org.apache.mxnet.DataBatch @@ -37,13 +35,13 @@ import org.apache.mxnet.Context import org.apache.mxnet.Xavier import org.apache.mxnet.optimizer.RMSProp import org.apache.mxnet.Executor +import org.apache.mxnetexamples.Util import scala.collection.immutable.ListMap import scala.sys.process.Process /** * Example of multi-task - * @author Depeng Liang */ object ExampleMultiTask { private val logger = LoggerFactory.getLogger(classOf[ExampleMultiTask]) @@ -204,11 +202,8 @@ object ExampleMultiTask { val baseUrl = "https://s3.us-east-2.amazonaws.com/mxnet-scala/scala-example-ci" val tempDirPath = System.getProperty("java.io.tmpdir") val modelDirPath = tempDirPath + File.separator + "multitask/" - val tmpFile = new File(tempDirPath + "/multitask/mnist.zip") - if (!tmpFile.exists()) { - FileUtils.copyURLToFile(new URL(baseUrl + "/mnist/mnist.zip"), - tmpFile) - } + Util.downloadUrl(baseUrl + "/mnist/mnist.zip", + tempDirPath + "/multitask/mnist.zip") // TODO: Need to confirm with Windows Process("unzip " + tempDirPath + "/multitask/mnist.zip -d " diff --git a/scala-package/examples/src/test/scala/org/apache/mxnetexamples/cnntextclassification/CNNClassifierExampleSuite.scala b/scala-package/examples/src/test/scala/org/apache/mxnetexamples/cnntextclassification/CNNClassifierExampleSuite.scala index f7d133279..95c9823e3 100644 --- a/scala-package/examples/src/test/scala/org/apache/mxnetexamples/cnntextclassification/CNNClassifierExampleSuite.scala +++ b/scala-package/examples/src/test/scala/org/apache/mxnetexamples/cnntextclassification/CNNClassifierExampleSuite.scala @@ -22,6 +22,7 @@ import java.net.URL import org.apache.commons.io.FileUtils import org.apache.mxnet.Context +import org.apache.mxnetexamples.Util import org.scalatest.{BeforeAndAfterAll, FunSuite} import org.slf4j.LoggerFactory @@ -46,22 +47,13 @@ class CNNClassifierExampleSuite extends FunSuite with BeforeAndAfterAll { logger.info("Downloading CNN text...") val baseUrl = "https://s3.us-east-2.amazonaws.com/mxnet-scala" - var tmpFile = new File(tempDirPath + "/CNN/rt-polarity.pos") - if (!tmpFile.exists()) { - FileUtils.copyURLToFile(new URL(baseUrl + "/scala-example-ci/CNN/rt-polarity.pos"), - tmpFile) - } - tmpFile = new File(tempDirPath + 
"/CNN/rt-polarity.neg") - if (!tmpFile.exists()) { - FileUtils.copyURLToFile(new URL(baseUrl + "/scala-example-ci/CNN/rt-polarity.neg"), - tmpFile) - } + Util.downloadUrl(baseUrl + "/scala-example-ci/CNN/rt-polarity.pos", + tempDirPath + "/CNN/rt-polarity.pos") + Util.downloadUrl(baseUrl + "/scala-example-ci/CNN/rt-polarity.neg", + tempDirPath + "/CNN/rt-polarity.neg") logger.info("Downloading pretrianed Word2Vec Model, may take a while") - tmpFile = new File(tempDirPath + "/CNN/" + w2vModelName) - if (!tmpFile.exists()) { - FileUtils.copyURLToFile(new URL(baseUrl + "/scala-example-ci/CNN/" + w2vModelName), - tmpFile) - } + Util.downloadUrl(baseUrl + "/scala-example-ci/CNN/" + w2vModelName, + tempDirPath + "/CNN/" + w2vModelName) val modelDirPath = tempDirPath + File.separator + "CNN" diff --git a/scala-package/examples/src/test/scala/org/apache/mxnetexamples/customop/CustomOpExampleSuite.scala b/scala-package/examples/src/test/scala/org/apache/mxnetexamples/customop/CustomOpExampleSuite.scala index 4ba0e1bb8..6385e062a 100644 --- a/scala-package/examples/src/test/scala/org/apache/mxnetexamples/customop/CustomOpExampleSuite.scala +++ b/scala-package/examples/src/test/scala/org/apache/mxnetexamples/customop/CustomOpExampleSuite.scala @@ -21,6 +21,7 @@ import java.net.URL import org.apache.commons.io.FileUtils import org.apache.mxnet.Context +import org.apache.mxnetexamples.Util import org.scalatest.{BeforeAndAfterAll, FunSuite} import org.slf4j.LoggerFactory @@ -64,11 +65,8 @@ class CustomOpExampleSuite extends FunSuite with BeforeAndAfterAll { val baseUrl = "https://s3.us-east-2.amazonaws.com/mxnet-scala/scala-example-ci" val tempDirPath = System.getProperty("java.io.tmpdir") val modelDirPath = tempDirPath + File.separator + "mnist/" - val tmpFile = new File(tempDirPath + "/mnist/mnist.zip") - if (!tmpFile.exists()) { - FileUtils.copyURLToFile(new URL(baseUrl + "/mnist/mnist.zip"), - tmpFile) - } + Util.downloadUrl(baseUrl + "/mnist/mnist.zip", + tempDirPath + "/mnist/mnist.zip") // TODO: Need to confirm with Windows Process("unzip " + tempDirPath + "/mnist/mnist.zip -d " + tempDirPath + "/mnist/") ! diff --git a/scala-package/examples/src/test/scala/org/apache/mxnetexamples/gan/GanExampleSuite.scala b/scala-package/examples/src/test/scala/org/apache/mxnetexamples/gan/GanExampleSuite.scala index 12459fb1c..8ab3a4b36 100644 --- a/scala-package/examples/src/test/scala/org/apache/mxnetexamples/gan/GanExampleSuite.scala +++ b/scala-package/examples/src/test/scala/org/apache/mxnetexamples/gan/GanExampleSuite.scala @@ -22,6 +22,7 @@ import java.net.URL import org.apache.commons.io.FileUtils import org.apache.mxnet.Context +import org.apache.mxnetexamples.Util import org.scalatest.{BeforeAndAfterAll, FunSuite} import org.slf4j.LoggerFactory @@ -38,11 +39,8 @@ class GanExampleSuite extends FunSuite with BeforeAndAfterAll{ val tempDirPath = System.getProperty("java.io.tmpdir") val modelDirPath = tempDirPath + File.separator + "mnist/" logger.info("tempDirPath: %s".format(tempDirPath)) - val tmpFile = new File(tempDirPath + "/mnist/mnist.zip") - if (!tmpFile.exists()) { - FileUtils.copyURLToFile(new URL(baseUrl + "/mnist/mnist.zip"), - tmpFile) - } + Util.downloadUrl(baseUrl + "/mnist/mnist.zip", + tempDirPath + "/mnist/mnist.zip") // TODO: Need to confirm with Windows Process("unzip " + tempDirPath + "/mnist/mnist.zip -d " + tempDirPath + "/mnist/") ! 
diff --git a/scala-package/examples/src/test/scala/org/apache/mxnetexamples/imclassification/MNISTExampleSuite.scala b/scala-package/examples/src/test/scala/org/apache/mxnetexamples/imclassification/MNISTExampleSuite.scala index 3e91b5b02..7b1d6ddc3 100644 --- a/scala-package/examples/src/test/scala/org/apache/mxnetexamples/imclassification/MNISTExampleSuite.scala +++ b/scala-package/examples/src/test/scala/org/apache/mxnetexamples/imclassification/MNISTExampleSuite.scala @@ -22,6 +22,7 @@ import java.net.URL import org.apache.commons.io.FileUtils import org.apache.mxnet.Context +import org.apache.mxnetexamples.Util import org.scalatest.{BeforeAndAfterAll, FunSuite} import org.slf4j.LoggerFactory @@ -41,11 +42,8 @@ class MNISTExampleSuite extends FunSuite with BeforeAndAfterAll { val tempDirPath = System.getProperty("java.io.tmpdir") val modelDirPath = tempDirPath + File.separator + "mnist/" logger.info("tempDirPath: %s".format(tempDirPath)) - val tmpFile = new File(tempDirPath + "/mnist/mnist.zip") - if (!tmpFile.exists()) { - FileUtils.copyURLToFile(new URL(baseUrl + "/mnist/mnist.zip"), - tmpFile) - } + Util.downloadUrl(baseUrl + "/mnist/mnist.zip", + tempDirPath + "/mnist/mnist.zip") // TODO: Need to confirm with Windows Process("unzip " + tempDirPath + "/mnist/mnist.zip -d " + tempDirPath + "/mnist/") ! diff --git a/scala-package/examples/src/test/scala/org/apache/mxnetexamples/infer/imageclassifier/ImageClassifierExampleSuite.scala b/scala-package/examples/src/test/scala/org/apache/mxnetexamples/infer/imageclassifier/ImageClassifierExampleSuite.scala index 2b5ac7f8a..f0bb07b4a 100644 --- a/scala-package/examples/src/test/scala/org/apache/mxnetexamples/infer/imageclassifier/ImageClassifierExampleSuite.scala +++ b/scala-package/examples/src/test/scala/org/apache/mxnetexamples/infer/imageclassifier/ImageClassifierExampleSuite.scala @@ -24,6 +24,7 @@ import java.net.URL import org.apache.commons.io.FileUtils import org.apache.mxnet.Context +import org.apache.mxnetexamples.Util import sys.process.Process @@ -42,28 +43,14 @@ class ImageClassifierExampleSuite extends FunSuite with BeforeAndAfterAll { val baseUrl = "https://s3.us-east-2.amazonaws.com/scala-infer-models" - var tmpFile = new File(tempDirPath + "/resnet18/resnet-18-symbol.json") - if (!tmpFile.exists()) { - FileUtils.copyURLToFile(new URL(baseUrl + "/resnet-18/resnet-18-symbol.json"), - tmpFile) - } - tmpFile = new File(tempDirPath + "/resnet18/resnet-18-0000.params") - if (!tmpFile.exists()) { - FileUtils.copyURLToFile(new URL(baseUrl + "/resnet-18/resnet-18-0000.params"), - tmpFile) - } - tmpFile = new File(tempDirPath + "/resnet18/synset.txt") - if (!tmpFile.exists()) { - FileUtils.copyURLToFile(new URL(baseUrl + "/resnet-18/synset.txt"), - tmpFile) - } - tmpFile = new File(tempDirPath + "/inputImages/resnet18/Pug-Cookie.jpg") - if (!tmpFile.exists()) { - FileUtils.copyURLToFile( - new URL("https://s3.amazonaws.com/model-server/inputs/Pug-Cookie.jpg"), - tmpFile - ) - } + Util.downloadUrl(baseUrl + "/resnet-18/resnet-18-symbol.json", + tempDirPath + "/resnet18/resnet-18-symbol.json") + Util.downloadUrl(baseUrl + "/resnet-18/resnet-18-0000.params", + tempDirPath + "/resnet18/resnet-18-0000.params") + Util.downloadUrl(baseUrl + "/resnet-18/synset.txt", + tempDirPath + "/resnet18/synset.txt") + Util.downloadUrl("https://s3.amazonaws.com/model-server/inputs/Pug-Cookie.jpg", + tempDirPath + "/inputImages/resnet18/Pug-Cookie.jpg") val modelDirPath = tempDirPath + File.separator + "resnet18/" val inputImagePath = tempDirPath + 
File.separator + diff --git a/scala-package/examples/src/test/scala/org/apache/mxnetexamples/infer/objectdetector/ObjectDetectorExampleSuite.scala b/scala-package/examples/src/test/scala/org/apache/mxnetexamples/infer/objectdetector/ObjectDetectorExampleSuite.scala index 85b98381a..31da38569 100644 --- a/scala-package/examples/src/test/scala/org/apache/mxnetexamples/infer/objectdetector/ObjectDetectorExampleSuite.scala +++ b/scala-package/examples/src/test/scala/org/apache/mxnetexamples/infer/objectdetector/ObjectDetectorExampleSuite.scala @@ -22,6 +22,7 @@ import java.net.URL import org.apache.commons.io.FileUtils import org.apache.mxnet.Context +import org.apache.mxnetexamples.Util import org.scalatest.{BeforeAndAfterAll, FunSuite} import org.slf4j.LoggerFactory @@ -39,27 +40,14 @@ class ObjectDetectorExampleSuite extends FunSuite with BeforeAndAfterAll { val modelBase = "https://s3.amazonaws.com/model-server/models/resnet50_ssd/" val imageBase = "https://s3.amazonaws.com/model-server/inputs/" - - var tmpFile = new File(tempDirPath + "/resnetssd/resnet50_ssd_model-symbol.json") - if (!tmpFile.exists()) { - FileUtils.copyURLToFile(new URL(modelBase + "resnet50_ssd_model-symbol.json"), - tmpFile) - } - tmpFile = new File(tempDirPath + "/resnetssd/resnet50_ssd_model-0000.params") - if (!tmpFile.exists()) { - FileUtils.copyURLToFile(new URL(modelBase + "resnet50_ssd_model-0000.params"), - tmpFile) - } - tmpFile = new File(tempDirPath + "/resnetssd/synset.txt") - if (!tmpFile.exists()) { - FileUtils.copyURLToFile(new URL(modelBase + "synset.txt"), - tmpFile) - } - tmpFile = new File(tempDirPath + "/inputImages/resnetssd/dog-ssd.jpg") - if (!tmpFile.exists()) { - FileUtils.copyURLToFile(new URL(imageBase + "dog-ssd.jpg"), - tmpFile) - } + Util.downloadUrl(modelBase + "resnet50_ssd_model-symbol.json", + tempDirPath + "/resnetssd/resnet50_ssd_model-symbol.json") + Util.downloadUrl(modelBase + "resnet50_ssd_model-0000.params", + tempDirPath + "/resnetssd/resnet50_ssd_model-0000.params") + Util.downloadUrl(modelBase + "synset.txt", + tempDirPath + "/resnetssd/synset.txt") + Util.downloadUrl(imageBase + "dog-ssd.jpg", + tempDirPath + "/inputImages/resnetssd/dog-ssd.jpg") val modelDirPath = tempDirPath + File.separator + "resnetssd/" val inputImagePath = tempDirPath + File.separator + From 024b5a916dd3a39a39031ce5e6565cd7d9d60fe2 Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Mon, 30 Jul 2018 13:34:34 -0700 Subject: [PATCH 29/63] [MXNET-11241] Avoid use of troublesome cudnnFind() results when grad_req='add' (#11338) * Add tests that fail due to issue 11241 * Fix #11241 Conv1D throws CUDNN_STATUS_EXECUTION_FAILED * Force algo 1 when grad_req==add with large c. Expand tests. * Shorten test runtimes. 
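For context, the sketch below illustrates the situation this fix targets; it is not part of the patch, and the shapes and the `mx.gpu(0)` context are assumptions. Binding a convolution with `grad_req='add'` makes cuDNN accumulate the weight gradient into existing memory (beta != 0), and with a channel count at the 64*1024 threshold used in the diff, an algo selected by cudnnFind() could fail at execution time; the patch forces `CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1` in that regime instead.

```
import mxnet as mx

# A convolution whose weight gradient is accumulated (grad_req='add'),
# i.e. the cuDNN wgrad kernel runs with beta != 0.
data = mx.sym.Variable('data')
conv = mx.sym.Convolution(data=data, num_filter=8, kernel=(3,),
                          layout='NCW', name='conv')

# 64*1024 input channels is the large-c regime described above (illustrative).
exe = conv.simple_bind(mx.gpu(0), data=(1, 64 * 1024, 8), grad_req='add')
exe.forward(is_train=True)
exe.backward(mx.nd.ones(exe.outputs[0].shape, ctx=mx.gpu(0)))
mx.nd.waitall()
```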
--- src/operator/nn/convolution.cu | 20 +++++-- src/operator/nn/cudnn/cudnn_algoreg-inl.h | 11 +++- src/operator/nn/cudnn/cudnn_convolution-inl.h | 36 +++++++++-- .../nn/cudnn/cudnn_deconvolution-inl.h | 38 ++++++++++-- src/operator/nn/deconvolution.cu | 20 +++++-- src/operator/operator_common.h | 2 +- tests/python/gpu/test_operator_gpu.py | 59 +++++++++++++++++++ 7 files changed, 162 insertions(+), 24 deletions(-) diff --git a/src/operator/nn/convolution.cu b/src/operator/nn/convolution.cu index 797557e35..daccc5518 100644 --- a/src/operator/nn/convolution.cu +++ b/src/operator/nn/convolution.cu @@ -41,7 +41,8 @@ static CuDNNConvolutionOp& GetCuDNNConvOp(const ConvolutionParam& param, int backward_compute_type, const std::vector& in_shape, const std::vector& out_shape, - const RunContext& rctx) { + const RunContext& rctx, + bool add_to_weight) { #if DMLC_CXX11_THREAD_LOCAL static thread_local std::unordered_map >, @@ -57,14 +58,18 @@ static CuDNNConvolutionOp& GetCuDNNConvOp(const ConvolutionParam& param, ndim += s.ndim(); for (auto &s : out_shape) ndim += s.ndim(); - key.Reserve(1 /* for forward_compute_type */ + 1 /* for backward_compute_type */ - + ndim + 1 /* for dev_id */); + key.Reserve(1 /* for forward_compute_type */ + + 1 /* for backward_compute_type */ + + ndim /* for in and out shapes */ + + 1 /* for dev_id */ + + 1 /* for add_to_weight */); key.AddSign(forward_compute_type); key.AddSign(backward_compute_type); key.AddSign(in_shape); key.AddSign(out_shape); key.AddSign(rctx.ctx.dev_id); + key.AddSign(add_to_weight ? 1 : 0); auto it = ops.find(key); if (it == ops.end()) { @@ -74,7 +79,7 @@ static CuDNNConvolutionOp& GetCuDNNConvOp(const ConvolutionParam& param, CHECK(ins_ret.second); it = ins_ret.first; it->second->Init(param, forward_compute_type, backward_compute_type, in_shape, - out_shape, rctx); + out_shape, rctx, add_to_weight); } return *it->second; } @@ -141,8 +146,10 @@ void ConvolutionCompute(const nnvm::NodeAttrs& attrs, std::vector out_shape(1, outputs[0].shape_); for (size_t i = 0; i < in_shape.size(); i++) in_shape[i] = inputs[i].shape_; + // req[conv::kWeight] is only set for backward, so assume the typical 'write' for now. 
+ auto add_to_weight = false; CuDNNConvolutionOp &op = GetCuDNNConvOp(param, - compute_type, compute_type, in_shape, out_shape, ctx.run_ctx); + compute_type, compute_type, in_shape, out_shape, ctx.run_ctx, add_to_weight); op.Forward(ctx, inputs, req, outputs); } }) @@ -220,8 +227,9 @@ void ConvolutionGradCompute(const nnvm::NodeAttrs& attrs, std::vector out_shape(1, out_grad.shape_); for (size_t i = 0; i < in_shape.size(); i++) in_shape[i] = in_data[i].shape_; + auto add_to_weight = req[conv::kWeight] == kAddTo; CuDNNConvolutionOp &op = GetCuDNNConvOp(param, - compute_type, compute_type, in_shape, out_shape, ctx.run_ctx); + compute_type, compute_type, in_shape, out_shape, ctx.run_ctx, add_to_weight); op.Backward(ctx, std::vector{out_grad}, in_data, req, in_grad); } }) diff --git a/src/operator/nn/cudnn/cudnn_algoreg-inl.h b/src/operator/nn/cudnn/cudnn_algoreg-inl.h index e029c837b..3b59fd1c3 100644 --- a/src/operator/nn/cudnn/cudnn_algoreg-inl.h +++ b/src/operator/nn/cudnn/cudnn_algoreg-inl.h @@ -72,12 +72,13 @@ class CuDNNAlgoReg { cudnnDataType_t cudnn_forward_compute_type, cudnnDataType_t cudnn_backward_compute_type, int sm_arch, + bool add_to_weight, CuDNNAlgo *fwd, CuDNNAlgo *bwd, CuDNNAlgo *flt) { CHECK(in_shape.size() == 2 || in_shape.size() == 3); ParamKey key{param, in_shape[0], in_shape[1], out_shape[0], cudnn_data_type, - cudnn_forward_compute_type, cudnn_backward_compute_type, sm_arch}; + cudnn_forward_compute_type, cudnn_backward_compute_type, sm_arch, add_to_weight}; std::lock_guard guard(lock_); auto i = reg_.find(key); if (i != reg_.end()) { @@ -96,12 +97,13 @@ class CuDNNAlgoReg { cudnnDataType_t cudnn_forward_compute_type, cudnnDataType_t cudnn_backward_compute_type, int sm_arch, + bool add_to_weight, const CuDNNAlgo &fwd, const CuDNNAlgo &bwd, const CuDNNAlgo &flt) { CHECK(in_shape.size() == 2 || in_shape.size() == 3); ParamKey key{param, in_shape[0], in_shape[1], out_shape[0], cudnn_data_type, - cudnn_forward_compute_type, cudnn_backward_compute_type, sm_arch}; + cudnn_forward_compute_type, cudnn_backward_compute_type, sm_arch, add_to_weight}; std::lock_guard guard(lock_); if (param.cudnn_tune.value() && reg_.size() % 50 == 0) { LOG(INFO) << "Running performance tests to find the best convolution " @@ -140,6 +142,7 @@ class CuDNNAlgoReg { cudnnDataType_t cudnn_forward_compute_type; cudnnDataType_t cudnn_backward_compute_type; int sm_arch; + bool add_to_weight; bool operator==(const ParamKey& other) const { return this->param == other.param && @@ -149,7 +152,8 @@ class CuDNNAlgoReg { this->cudnn_data_type == other.cudnn_data_type && this->cudnn_forward_compute_type == other.cudnn_forward_compute_type && this->cudnn_backward_compute_type == other.cudnn_backward_compute_type && - this->sm_arch == other.sm_arch; + this->sm_arch == other.sm_arch && + this->add_to_weight == other.add_to_weight; } }; @@ -164,6 +168,7 @@ class CuDNNAlgoReg { ret = dmlc::HashCombine(ret, static_cast(key.cudnn_forward_compute_type)); ret = dmlc::HashCombine(ret, static_cast(key.cudnn_backward_compute_type)); ret = dmlc::HashCombine(ret, key.sm_arch); + ret = dmlc::HashCombine(ret, key.add_to_weight); return ret; } }; diff --git a/src/operator/nn/cudnn/cudnn_convolution-inl.h b/src/operator/nn/cudnn/cudnn_convolution-inl.h index 4b1cbbe70..827c89faa 100644 --- a/src/operator/nn/cudnn/cudnn_convolution-inl.h +++ b/src/operator/nn/cudnn/cudnn_convolution-inl.h @@ -59,9 +59,11 @@ class CuDNNConvolutionOp { int backward_compute_type, const std::vector& in_shape, const std::vector& out_shape, - const 
RunContext& rctx) { + const RunContext& rctx, + bool add_to_weight) { using namespace mshadow; this->param_ = param; + this->add_to_weight_ = add_to_weight; InitBufferForParam(); auto cudnn_forward_compute_type = convertToCuDNNDataType(forward_compute_type); auto cudnn_backward_compute_type = convertToCuDNNDataType(backward_compute_type); @@ -247,6 +249,7 @@ class CuDNNConvolutionOp { gbias.dptr_)); } if (req[conv::kWeight] != kNullOp) { + CHECK_EQ(add_to_weight_, req[conv::kWeight] == kAddTo); CUDNN_CALL(cudnnConvolutionBackwardFilter(s->dnn_handle_, &alpha, in_desc_, @@ -610,8 +613,8 @@ class CuDNNConvolutionOp { cudnnDataType_t cudnn_backward_compute_type) { if (!CuDNNConvAlgoReg::Get()->Find(param_, in_shape, out_shape, dtype_, cudnn_forward_compute_type, cudnn_backward_compute_type, - SMArch(rctx.ctx.dev_id), &forward_algo_, &back_algo_, - &back_algo_w_)) { + SMArch(rctx.ctx.dev_id), add_to_weight_, + &forward_algo_, &back_algo_, &back_algo_w_)) { mshadow::Stream *s = rctx.get_stream(); CHECK_EQ(s->dnn_handle_ownership_, mshadow::Stream::OwnHandle); size_t workspace_byte = static_cast(param_.workspace * sizeof(DType)); @@ -645,6 +648,8 @@ class CuDNNConvolutionOp { auto max_bwd_filt_algos = MaxBackwardFilterAlgos(s->dnn_handle_); std::vector bwd_filt_results(max_bwd_filt_algos); int actual_bwd_filter_algos = 0; + // In cudnn v7.1.4, find() returned wgrad algos that could fail for large c if we + // were summing into the output (i.e. beta != 0). Get() returned OK algos though. auto bwd_filter_algo_discoverer = param_.cudnn_tune.value() == conv::kOff ? cudnnGetConvolutionBackwardFilterAlgorithm_v7 : cudnnFindConvolutionBackwardFilterAlgorithm; @@ -792,6 +797,13 @@ class CuDNNConvolutionOp { } } #endif // CUDNN_MAJOR < 7 + + // Fix for issue #11241 + int cudnn_find_issue_max_features = 64 * 1024; + if (add_to_weight_ && Features(in_shape[conv::kData]) >= cudnn_find_issue_max_features) { + this->back_algo_w_.Set(CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1, true); + } + // An algo specification by the user may be cached here, but another // convolution will match only if identically specified. // We're caching results of *Get* as well as *Find*, but these records @@ -799,7 +811,8 @@ class CuDNNConvolutionOp { CuDNNConvAlgoReg::Get()->Register(param_, in_shape, out_shape, dtype_, cudnn_forward_compute_type, cudnn_backward_compute_type, - SMArch(rctx.ctx.dev_id), this->forward_algo_, + SMArch(rctx.ctx.dev_id), this->add_to_weight_, + this->forward_algo_, this->back_algo_, this->back_algo_w_); } // If we're allowing Tensor Core variants of the algos to be considered in @@ -921,6 +934,19 @@ class CuDNNConvolutionOp { return tensor.MSize() * sizeof(DType); } + // Given a tensor shape of this operation, return the number of features 'c' + int64_t Features(const TShape &dshape) { + int c = 0; + switch (dshape.ndim()) { + case 3: c = ConvertLayout(dshape.get<3>(), param_.layout.value(), kNCW)[1]; break; + case 4: c = ConvertLayout(dshape.get<4>(), param_.layout.value(), kNCHW)[1]; break; + case 5: c = ConvertLayout(dshape.get<5>(), param_.layout.value(), kNCDHW)[1]; break; + default: + LOG(FATAL) << "Unexpected convolution data dimension " << dshape.ndim(); + } + return c; + } + std::vector param_stride_; std::vector param_dilate_; std::vector param_pad_; @@ -953,6 +979,8 @@ class CuDNNConvolutionOp { cudnnTensorFormat_t format_; // Allow TensorCore algo policy bool cudnn_tensor_core_; + // Is req[kWeight] == conv::kAddTo ? 
+ bool add_to_weight_; ConvolutionParam param_; }; #endif // __CUDACC__ && CUDNN diff --git a/src/operator/nn/cudnn/cudnn_deconvolution-inl.h b/src/operator/nn/cudnn/cudnn_deconvolution-inl.h index cb0de4c96..f1b40cce2 100644 --- a/src/operator/nn/cudnn/cudnn_deconvolution-inl.h +++ b/src/operator/nn/cudnn/cudnn_deconvolution-inl.h @@ -56,9 +56,11 @@ class CuDNNDeconvolutionOp { int backward_compute_type, const std::vector& in_shape, const std::vector& out_shape, - const RunContext& rctx) { + const RunContext& rctx, + bool add_to_weight) { using namespace mshadow; this->param_ = param; + this->add_to_weight_ = add_to_weight; InitBufferForParam(); auto cudnn_forward_compute_type = convertToCuDNNDataType(forward_compute_type); auto cudnn_backward_compute_type = convertToCuDNNDataType(backward_compute_type); @@ -257,6 +259,7 @@ class CuDNNDeconvolutionOp { filter_desc_, gwmat.dptr_ + weight_offset_ * g)); #elif CUDNN_MAJOR >= 5 + CHECK_EQ(add_to_weight_, req[deconv::kWeight] == kAddTo); CUDNN_CALL(cudnnConvolutionBackwardFilter( s->dnn_handle_, &alpha, @@ -543,8 +546,8 @@ class CuDNNDeconvolutionOp { if (!CuDNNDeconvAlgoReg::Get()->Find(param_, in_shape, out_shape, dtype_, cudnn_forward_compute_type, cudnn_backward_compute_type, - SMArch(rctx.ctx.dev_id), &forward_algo_, - &back_algo_, &back_algo_w_)) { + SMArch(rctx.ctx.dev_id), add_to_weight_, + &forward_algo_, &back_algo_, &back_algo_w_)) { mshadow::Stream *s = rctx.get_stream(); CHECK_EQ(s->dnn_handle_ownership_, mshadow::Stream::OwnHandle); size_t workspace_byte = static_cast(param_.workspace * sizeof(DType)); @@ -578,6 +581,8 @@ class CuDNNDeconvolutionOp { auto max_bwd_filt_algos = MaxBackwardFilterAlgos(s->dnn_handle_); std::vector bwd_filt_results(max_bwd_filt_algos); int actual_bwd_filter_algos = 0; + // In cudnn v7.1.4, find() returned wgrad algos that could fail for large c if we + // were summing into the output (i.e. beta != 0). Get() returned OK algos though. auto bwd_filter_algo_discoverer = param_.cudnn_tune.value() == conv::kOff ? cudnnGetConvolutionBackwardFilterAlgorithm_v7 : cudnnFindConvolutionBackwardFilterAlgorithm; @@ -728,6 +733,14 @@ class CuDNNDeconvolutionOp { } } #endif // CUDNN_MAJOR < 7 + + // Fix for issue #11241 + int cudnn_find_issue_max_features = 64 * 1024; + // With deconvolution, the algo sensitivity is to a large number of output features + if (add_to_weight_ && Features(out_shape[deconv::kOut]) >= cudnn_find_issue_max_features) { + this->back_algo_w_.Set(CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1, true); + } + // An algo specification by the user may be cached here, but another // convolution will match only if identically specified. 
// We're caching results of *Get* as well as *Find*, but these records @@ -735,7 +748,8 @@ class CuDNNDeconvolutionOp { CuDNNDeconvAlgoReg::Get()->Register(param_, in_shape, out_shape, dtype_, cudnn_forward_compute_type, cudnn_backward_compute_type, - SMArch(rctx.ctx.dev_id), this->forward_algo_, + SMArch(rctx.ctx.dev_id), this->add_to_weight_, + this->forward_algo_, this->back_algo_, this->back_algo_w_); } // If we're allowing Tensor Core variants of the algos to be considered in @@ -866,6 +880,20 @@ class CuDNNDeconvolutionOp { return tensor.MSize() * sizeof(DType); } + + // Given a tensor shape of this operation, return the number of features 'c' + int64_t Features(const TShape &dshape) { + int c = 0; + switch (dshape.ndim()) { + case 3: c = ConvertLayout(dshape.get<3>(), param_.layout.value(), kNCW)[1]; break; + case 4: c = ConvertLayout(dshape.get<4>(), param_.layout.value(), kNCHW)[1]; break; + case 5: c = ConvertLayout(dshape.get<5>(), param_.layout.value(), kNCDHW)[1]; break; + default: + LOG(FATAL) << "Unexpected deconvolution data dimension " << dshape.ndim(); + } + return c; + } + std::vector param_stride_; std::vector param_dilate_; @@ -912,6 +940,8 @@ class CuDNNDeconvolutionOp { cudnnTensorFormat_t format_; // Allow TensorCore algo policy bool cudnn_tensor_core_; + // Is req[kWeight] == deconv::kAddTo ? + bool add_to_weight_; DeconvolutionParam param_; }; #endif // CUDNN diff --git a/src/operator/nn/deconvolution.cu b/src/operator/nn/deconvolution.cu index cdfb60690..1c3970b9e 100644 --- a/src/operator/nn/deconvolution.cu +++ b/src/operator/nn/deconvolution.cu @@ -39,7 +39,8 @@ static CuDNNDeconvolutionOp &GetCuDNNDeconvOp(const DeconvolutionParam& p int backward_compute_type, const std::vector& in_shape, const std::vector& out_shape, - const RunContext& rctx) { + const RunContext& rctx, + bool add_to_weight) { #if DMLC_CXX11_THREAD_LOCAL static thread_local std::unordered_map >, @@ -55,14 +56,18 @@ static CuDNNDeconvolutionOp &GetCuDNNDeconvOp(const DeconvolutionParam& p ndim += s.ndim(); for (auto &s : out_shape) ndim += s.ndim(); - key.Reserve(1 /* for forward_compute_type */ + 1 /* for backward_compute_type */ - + ndim + 1 /* for dev_id */); + key.Reserve(1 /* for forward_compute_type */ + + 1 /* for backward_compute_type */ + + ndim /* for in and out shapes */ + + 1 /* for dev_id */ + + 1 /* for add_to_weight */); key.AddSign(forward_compute_type); key.AddSign(backward_compute_type); key.AddSign(in_shape); key.AddSign(out_shape); key.AddSign(rctx.ctx.dev_id); + key.AddSign(add_to_weight ? 1 : 0); auto it = ops.find(key); if (it == ops.end()) { @@ -72,7 +77,7 @@ static CuDNNDeconvolutionOp &GetCuDNNDeconvOp(const DeconvolutionParam& p CHECK(ins_ret.second); it = ins_ret.first; it->second->Init(param, forward_compute_type, backward_compute_type, in_shape, - out_shape, rctx); + out_shape, rctx, add_to_weight); } return *it->second; } @@ -109,8 +114,10 @@ void DeconvolutionCompute(const nnvm::NodeAttrs& attrs, for (size_t i = 0; i < in_shape.size(); i++) { in_shape[i] = inputs[i].shape_; } + // req[deconv::kWeight] is only set for backward, so assume the typical 'write' for now. 
+ auto add_to_weight = false; GetCuDNNDeconvOp(param, compute_type, compute_type, - in_shape, out_shape, ctx.run_ctx).Forward(ctx, inputs, req, outputs); + in_shape, out_shape, ctx.run_ctx, add_to_weight).Forward(ctx, inputs, req, outputs); } }) #else @@ -156,8 +163,9 @@ void DeconvolutionGradCompute(const nnvm::NodeAttrs& attrs, for (size_t i = 0; i < in_shape.size(); i++) { in_shape[i] = in_data[i].shape_; } + auto add_to_weight = req[deconv::kWeight] == kAddTo; GetCuDNNDeconvOp(param, compute_type, compute_type, - in_shape, out_shape, ctx.run_ctx).Backward(ctx, + in_shape, out_shape, ctx.run_ctx, add_to_weight).Backward(ctx, std::vector{out_grad}, in_data, req, in_grad); } }) diff --git a/src/operator/operator_common.h b/src/operator/operator_common.h index 02130eb32..29112939a 100644 --- a/src/operator/operator_common.h +++ b/src/operator/operator_common.h @@ -494,7 +494,7 @@ inline void LogUnimplementedOp(const nnvm::NodeAttrs& attrs, } class OpSignature { - std::vector eles; + std::vector eles; uint64_t hash; public: diff --git a/tests/python/gpu/test_operator_gpu.py b/tests/python/gpu/test_operator_gpu.py index 8877b5739..a3e663a68 100644 --- a/tests/python/gpu/test_operator_gpu.py +++ b/tests/python/gpu/test_operator_gpu.py @@ -522,6 +522,65 @@ def test_convolution_options(): sym_no_cudnn = mx.sym.Convolution(num_filter=3, kernel=(1,1,1), pad=(0,0,0), cudnn_off=True, name='conv') check_consistency_NxM([sym, sym_no_cudnn], ctx_list) +# This test is designed to expose an issue with cudnn v7.1.4 algo find() when invoked with large c. +# Algos returned by find() can fail to run with grad_req='add' (wgrad kernel beta parameter == 1.0f). +@with_seed() +def test_convolution_large_c(): + problematic_c = 64 * 1024 + # The convolution accumulates many values, so set large tolerances. + tol = {np.dtype(np.float32): 1, + np.dtype(np.float64): 1} + def test_1D_with_width(width, grad_req): + ctx_list = [{'ctx': mx.gpu(0), 'conv_data': (1, problematic_c, width), 'type_dict': {'conv_data': np.float32}}, + {'ctx': mx.gpu(0), 'conv_data': (1, problematic_c, width), 'type_dict': {'conv_data': np.float64}}] + sym = mx.sym.Convolution(layout='NCW', num_filter=8, kernel=(2,), name='conv') + check_consistency([sym, sym], ctx_list, tol=tol, grad_req=grad_req) + + def test_2D_with_width(width, grad_req): + ctx_list = [{'ctx': mx.gpu(0), 'conv_data': (1, problematic_c, 2, width), 'type_dict': {'conv_data': np.float32}}, + {'ctx': mx.gpu(0), 'conv_data': (1, problematic_c, 2, width), 'type_dict': {'conv_data': np.float64}}] + sym = mx.sym.Convolution(layout='NCHW', num_filter=4, kernel=(2,2), name='conv') + check_consistency([sym, sym], ctx_list, tol=tol, grad_req=grad_req) + + # Run with different data tensor shapes to run cudnnFind() multiple times. + # First, populate algo and op caches with models that always use cudnnFind() (req == 'write'). + # Then run models that must avoid cached cudnnFind() results in some cases (req == 'add'). + widths = [4, 16, 64] + for req in ['write', 'add']: + for width in widths: + test_1D_with_width(width, req) + test_2D_with_width(width, req) + + +# This test is designed to expose an issue with cudnn v7.1.4 algo find() when invoked with large c. +# Algos returned by find() can fail to run with grad_req='add' (wgrad kernel beta parameter == 1.0f). +@with_seed() +def test_deconvolution_large_c(): + problematic_c = 64 * 1024 + # The deconvolution accumulates many values, so set large tolerances. 
+    tol = {np.dtype(np.float32): 1,
+           np.dtype(np.float64): 1}
+    def test_1D_with_width(width, grad_req):
+        ctx_list = [{'ctx': mx.gpu(0), 'deconv_data': (1, 8, width), 'type_dict': {'deconv_data': np.float32}},
+                    {'ctx': mx.gpu(0), 'deconv_data': (1, 8, width), 'type_dict': {'deconv_data': np.float64}}]
+        sym = mx.sym.Deconvolution(layout='NCW', num_filter=problematic_c, kernel=(2,), name='deconv')
+        check_consistency([sym, sym], ctx_list, tol=tol, grad_req=grad_req)
+
+    def test_2D_with_width(width, grad_req):
+        ctx_list = [{'ctx': mx.gpu(0), 'deconv_data': (1, 8, 2, width), 'type_dict': {'deconv_data': np.float32}},
+                    {'ctx': mx.gpu(0), 'deconv_data': (1, 8, 2, width), 'type_dict': {'deconv_data': np.float64}}]
+        sym = mx.sym.Deconvolution(layout='NCHW', num_filter=problematic_c, kernel=(2,2), name='deconv')
+        check_consistency([sym, sym], ctx_list, tol=tol, grad_req=grad_req)
+
+    # Run with different data tensor shapes to run cudnnFind() multiple times.
+    # First, populate algo and op caches with models that always use cudnnFind() (req == 'write').
+    # Then run models that must avoid cached cudnnFind() results in some cases (req == 'add').
+    widths = [4, 16, 64]
+    for req in ['write', 'add']:
+        for width in widths:
+            test_1D_with_width(width, req)
+            test_2D_with_width(width, req)
+

 @with_seed()
 def test_convolution_versions():

From 478b4a1a6f4ea93b291dd5ace3dc85e21d823453 Mon Sep 17 00:00:00 2001
From: Rahul Huilgol
Date: Mon, 30 Jul 2018 15:30:51 -0700
Subject: [PATCH 30/63] Improving documentation and error messages for Async distributed training with Gluon (#11910)

* Add description about update on kvstore
* add async check for gluon
* only raise error if user set update_on_kvstore
* fix condition
* add async nightly test
* fix case when no kvstore
* add example for trainer creation in doc
---
 docs/faq/distributed_training.md | 21 +++++++++++--
 python/mxnet/gluon/trainer.py | 8 ++++-
 tests/nightly/dist_async_kvstore.py | 48 +++++++++++++++++++++++++++++
 3 files changed, 74 insertions(+), 3 deletions(-)
 create mode 100644 tests/nightly/dist_async_kvstore.py

diff --git a/docs/faq/distributed_training.md b/docs/faq/distributed_training.md
index 70078ba60..d4fa72db2 100644
--- a/docs/faq/distributed_training.md
+++ b/docs/faq/distributed_training.md
@@ -73,6 +73,23 @@ These can be passed as arguments to the iterator.
 You can look at [example/gluon/image_classification.py](https://github.com/apache/incubator-mxnet/blob/master/example/gluon/image_classification.py)
 to see an example usage.
+### Updating weights
+The KVStore server supports two modes: one aggregates the gradients and also updates the weights using them; the other only aggregates gradients. In the latter case, when a worker process pulls from the kvstore, it gets the aggregated gradients, which it then uses to update the weights locally.
+
+When using Gluon, you can choose between these modes by passing the `update_on_kvstore` argument when you create the [Trainer](https://mxnet.incubator.apache.org/versions/master/api/python/gluon/gluon.html#mxnet.gluon.Trainer) object like this:
+
+```
+trainer = gluon.Trainer(net.collect_params(), optimizer='sgd',
+                        optimizer_params={'learning_rate': opt.lr,
+                                          'wd': opt.wd,
+                                          'momentum': opt.momentum,
+                                          'multi_precision': True},
+                        kvstore=kv,
+                        update_on_kvstore=True)
+```
+
+When using the symbolic interface, the weight updates are performed on the server without the user having to do anything special.
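For comparison, a minimal sketch of that symbolic path (illustrative only, not part of this patch; `mod` is assumed to be an existing `mx.mod.Module` and `train_iter` an existing data iterator):

```python
# Hedged sketch: with the Module API, passing a distributed kvstore to fit()
# is all that is needed; the optimizer is shipped to the kvstore server.
kv = mx.kvstore.create('dist_async')
mod.fit(train_iter,
        optimizer='sgd',
        optimizer_params={'learning_rate': 0.1},
        kvstore=kv,
        num_epoch=1)
```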
+ ### Different Modes of Distributed Training Distributed training itself is enabled when kvstore creation string contains the word `dist`. @@ -86,9 +103,9 @@ In this mode, if a worker crashes, then it halts the progress of all workers. - `dist_async`: In asynchronous distributed training, the server receives gradients from one worker and immediately updates its store, which it uses to respond to any future pulls. This means that a worker who finishes processing a batch can pull the current parameters from server and start the next batch, even if other workers haven't finished processing the earlier batch. -This is faster than `dist_sync` but can take more epochs to converge. -In `async` mode, it is required to pass an optimizer because in the absence of an optimizer kvstore would replace the stored weights with received weights and this doesn't make sense for training in asynchronous mode. +This is faster than `dist_sync` because there is no cost of synchronization, but can take more epochs to converge. The update of weights is atomic, meaning no two updates happen on the same weight at the same time. However, the order of updates is not guaranteed. +In `async` mode, it is required to pass an optimizer because in the absence of an optimizer kvstore would replace the stored weights with received weights and this doesn't make sense for training in asynchronous mode. Hence, when using Gluon with `async` mode we need to set `update_on_kvstore` to `True`. - `dist_sync_device`: Same as `dist_sync` except that when there are multiple GPUs being used on each node, this mode aggregates gradients and updates weights on GPU while dist_sync does so on CPU memory. diff --git a/python/mxnet/gluon/trainer.py b/python/mxnet/gluon/trainer.py index b4263410a..98a6878b9 100644 --- a/python/mxnet/gluon/trainer.py +++ b/python/mxnet/gluon/trainer.py @@ -187,6 +187,11 @@ def _init_kvstore(self): arg_arrays = {param.name: param.data(self._contexts[0]) for param in self._params} kvstore, update_on_kvstore = _create_kvstore(config['kvstore'], len(self._contexts), arg_arrays) + if kvstore and 'async' in kvstore.type and config['update_on_kvstore'] is not None\ + and not config['update_on_kvstore']: + raise ValueError("Please set update_on_kvstore to true " + "when training in async mode.") + if config['update_on_kvstore'] is not None: update_on_kvstore = config['update_on_kvstore'] if kvstore: @@ -195,7 +200,8 @@ def _init_kvstore(self): self._distributed = 'dist' in kvstore.type if self._distributed: # kv.pull(row_sparse_grad) is not supported for dist kvstore - update_on_kvstore = self._contains_sparse_weight or self._contains_sparse_grad + update_on_kvstore = self._contains_sparse_weight or self._contains_sparse_grad \ + or 'async' in kvstore.type if update_on_kvstore: # optimizer preferably needs to be set before init for multiprecision kvstore.set_optimizer(self._optimizer) diff --git a/tests/nightly/dist_async_kvstore.py b/tests/nightly/dist_async_kvstore.py new file mode 100644 index 000000000..3e400eafa --- /dev/null +++ b/tests/nightly/dist_async_kvstore.py @@ -0,0 +1,48 @@ +#!/usr/bin/env python + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# pylint: skip-file +import sys +sys.path.insert(0, "../../python/") +import mxnet as mx + +kv = mx.kv.create('dist_async') +my_rank = kv.rank +nworker = kv.num_workers + +def test_gluon_trainer_type(): + def check_trainer_kv_update(update_on_kv): + params = mx.gluon.ParameterDict() + x = params.get('x', shape=(10,1), lr_mult=1.0) + params.initialize(ctx=[mx.cpu(0), mx.cpu(1)], init='zeros') + try: + trainer = mx.gluon.Trainer(params, 'sgd', {'learning_rate': 0.1}, kvstore=kv, update_on_kvstore=update_on_kv) + trainer._init_kvstore() + assert trainer._kv_initialized + assert trainer._update_on_kvstore is True + except ValueError: + assert update_on_kv is False + + check_trainer_kv_update(False) + check_trainer_kv_update(True) + check_trainer_kv_update(None) + print('worker ' + str(my_rank) + ' passed test_gluon_trainer_type') + +if __name__ == "__main__": + test_gluon_trainer_type() \ No newline at end of file From 815f42d0a425acd37b025aa6ee14cb31af052f9f Mon Sep 17 00:00:00 2001 From: Ankit Khedia <36249596+ankkhedia@users.noreply.github.com> Date: Mon, 30 Jul 2018 16:16:14 -0700 Subject: [PATCH 31/63] [MXNET-641] fix R windows install docs (#11805) * fix R windows install docs * addressed PR comments * PR comments * PR comments * fixed line wrappings * fixed line wrappings --- docs/install/windows_setup.md | 176 ++++++++++++++++++++++++++-------- 1 file changed, 134 insertions(+), 42 deletions(-) diff --git a/docs/install/windows_setup.md b/docs/install/windows_setup.md index 8a2b1c141..9d03474b5 100755 --- a/docs/install/windows_setup.md +++ b/docs/install/windows_setup.md @@ -62,7 +62,7 @@ Next, we install ```graphviz``` library that we use for visualizing network grap We have installed MXNet core library. Next, we will install MXNet interface package for programming language of your choice: - [Python](#install-the-mxnet-package-for-python) -- [R](#install-mxnet-for-r) +- [R](#install-mxnet-package-for-r) - [Julia](#install-the-mxnet-package-for-julia) - **Scala** is not yet available for Windows @@ -91,7 +91,7 @@ Done! We have installed MXNet with Python interface. Run below commands to verif ``` We actually did a small tensor computation using MXNet! You are all set with MXNet on your Windows machine. -## Install MXNet for R +## Install MXNet Package for R MXNet for R is available for both CPUs and GPUs. ### Installing MXNet on a Computer with a CPU Processor @@ -101,7 +101,7 @@ To install MXNet on a computer with a CPU processor, choose from two options: * Use the prebuilt binary package * Build the library from source code -#### Installing MXNet with the Prebuilt Binary Package +#### Installing MXNet with the Prebuilt Binary Package(CPU) For Windows users, MXNet provides prebuilt binary packages. You can install the package directly in the R console. @@ -114,81 +114,173 @@ For CPU-only package: install.packages("mxnet") ``` -For GPU-enabled package: +#### Building MXNet from Source Code(CPU) +1. Clone the MXNet github repo. 
-```r
- cran <- getOption("repos")
- cran["dmlc"] <- "https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/R/CRAN/GPU"
- options(repos = cran)
- install.packages("mxnet")
```sh
git clone --recursive https://github.com/apache/incubator-mxnet
```
The `--recursive` is to clone all the submodules used by MXNet. You will be editing the ```"/mxnet/R-package"``` folder.
+
+2. Download the prebuilt MXNet libraries for Windows from the [Windows release](https://github.com/yajiedesign/mxnet/releases). You will need `mxnet_x64_vc14_cpu.7z` and `prebuildbase_win10_x64_vc14.7z`.
-Run the following commands to install the MXNet dependencies and build the MXNet R package.
+
+3. Create a folder called ```R-package/inst/libs/x64```. MXNet supports only 64-bit operating systems, so you need the x64 folder.
-```r
- Rscript -e "install.packages('devtools', repo = 'https://cloud.r-project.org/')"
```
+
+4. Copy the following shared libraries (.dll files) into the ```R-package/inst/libs/x64``` folder:
```
-```bash
- cd R-package
- Rscript -e "library(devtools); library(methods); options(repos=c(CRAN='https://cloud.r-project.org/')); install_deps(dependencies = TRUE)"
- cd ..
- make rpkg
+libgcc_s_seh-1.dll
+libgfortran-3.dll
+libmxnet.dll
+libmxnet.lib
+libopenblas.dll
+libquadmath-0.dll
+mxnet.dll
+unzip.exe
+unzip32.dll
+vcomp140.dll
+wget.exe
+```
+These dlls can be found in `prebuildbase_win10_x64_vc14/3rdparty`, `mxnet_x64_vc14_cpu/build`, `mxnet_x64_vc14_cpu/lib`.
+
+5. Copy the header files from `dmlc`, `mxnet`, `mshadow` and `nnvm` from mxnet_x64_vc14_cpu/include and mxnet_x64_vc14_cpu/nnvm/include into `./R-package/inst/include`. It should look like:
+
+```
+./R-package/inst
+└── include
+    ├── dmlc
+    ├── mxnet
+    ├── mshadow
+    └── nnvm
+
+```
+6. Make sure that the R executable is added to your ```PATH``` in the environment variables. Running the ```where R``` command at the command prompt should return the location.
+7. Also make sure that Rtools is installed and the executable is added to your ```PATH``` in the environment variables.
+8. Temporary patch - im2rec currently results in crashes during the build. Remove the im2rec.h and im2rec.cc files in R-package/src/ from the cloned repository and comment out the two im2rec lines in [R-package/src/mxnet.cc](https://github.com/apache/incubator-mxnet/blob/master/R-package/src/mxnet.cc) as shown below.
+```cpp
+#include "./kvstore.h"
+#include "./export.h"
+//#include "./im2rec.h"
+......
+......
+  DataIterCreateFunction::InitRcppModule();
+  KVStore::InitRcppModule();
+  Exporter::InitRcppModule();
+//  IM2REC::InitRcppModule();
+}
+
+```
+
+9. Now open the Windows CMD with admin rights and change the directory to the `mxnet` folder (cloned repository). Then use the following commands
+to build R package:
+
+```bat
+echo import(Rcpp) > R-package\NAMESPACE
+echo import(methods) >> R-package\NAMESPACE
+Rscript -e "install.packages('devtools', repos = 'https://cloud.r-project.org')"
+cd R-package
+Rscript -e "library(devtools); library(methods); options(repos=c(CRAN='https://cloud.r-project.org')); install_deps(dependencies = TRUE)"
+cd ..
+
+R CMD INSTALL --no-multiarch R-package
+
+Rscript -e "require(mxnet); mxnet:::mxnet.export('R-package')"
+rm R-package/NAMESPACE
+Rscript -e "require(devtools); install_version('roxygen2', version = '5.0.1', repos = 'https://cloud.r-project.org/', quiet = TRUE)"
+Rscript -e "require(roxygen2); roxygen2::roxygenise('R-package')"
+
+R CMD INSTALL --build --no-multiarch R-package
```
+
### Installing MXNet on a Computer with a GPU Processor
+To install MXNet on a computer with a GPU processor, choose from two options:
+
+* Use the prebuilt binary package
+* Build the library from source code
+
+However, both options require a few common dependencies. You will need the following:
+* Install [Nvidia-drivers](http://www.nvidia.com/Download/index.aspx?lang=en-us) if not installed. The latest driver for your system configuration is recommended.
-To install MXNet R package on a computer with a GPU processor, you need the following:
+* Install [Microsoft Visual Studio](https://visualstudio.microsoft.com/downloads/) (VS2015 or VS2017 is required by CUDA)
-* Microsoft Visual Studio 2013
+* Install the [NVidia CUDA Toolkit](https://developer.nvidia.com/cuda-toolkit) (cu92 is recommended, though we support cu80, cu90, cu91 and cu92)
-* The NVidia CUDA Toolkit
+* Download and install [CuDNN](https://developer.nvidia.com/cudnn) (to provide a Deep Neural Network library). The latest version is recommended.
-* The MXNet package
+Note: A prerequisite for the above software is [Nvidia-drivers](http://www.nvidia.com/Download/index.aspx?lang=en-us), which we assume are installed.
-* CuDNN (to provide a Deep Neural Network library)
+#### Installing MXNet with the Prebuilt Binary Package(GPU)
+For Windows users, MXNet provides prebuilt binary packages.
+You can install the package directly in the R console after you have the above software installed.
-To install the required dependencies and install MXNet for R:
+For the GPU package:
-1. Install the [CUDA Toolkit](https://developer.nvidia.com/cuda-toolkit). The CUDA Toolkit depends on Visual Studio. To check whether your GPU is compatible with the CUDA Toolkit and for information on installing it, see NVidia's [CUDA Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-microsoft-windows/).
-3. Clone the MXNet github repo.
+```r
+ cran <- getOption("repos")
+ cran["dmlc"] <- "https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/R/CRAN/GPU/cuX"
+ options(repos = cran)
+ install.packages("mxnet")
+```
+Change X to 80, 90, 91 or 92 based on your CUDA toolkit version. Currently, MXNet supports these versions of CUDA.
+#### Building MXNet from Source Code(GPU)
+After you have installed the above software, continue with the following steps to build MXNet-R:
+1. Clone the MXNet github repo.
```sh
-git clone --recursive https://github.com/dmlc/mxnet
+git clone --recursive https://github.com/apache/incubator-mxnet
```
The `--recursive` is to clone all the submodules used by MXNet. You will be editing the ```"/mxnet/R-package"``` folder.
-4. Download prebuilt GPU-enabled MXNet libraries for Windows from https://github.com/yajiedesign/mxnet/releases. You will need `mxnet_x64_vc14_gpu.7z` and `prebuildbase_win10_x64_vc14.7z`.
-5. Download and install [CuDNN](https://developer.nvidia.com/cudnn).
-6. Create a folder called ```R-package/inst/libs/x64```. MXNet supports only 64-bit operating systems, so you need the x64 folder.
-7. Copy the following shared libraries (.dll files) into the ```R-package/inst/libs/x64``` folder:
-```
-cublas64_80.dll
-cudart64_80.dll
-cudnn64_5.dll
-curand64_80.dll
+
+2. Download the prebuilt GPU-enabled MXNet libraries for Windows from https://github.com/yajiedesign/mxnet/releases. You will need `mxnet_x64_vc14_gpu_cuX.7z` and `prebuildbase_win10_x64_vc14.7z`, where X stands for your CUDA toolkit version.
+
+3. Create a folder called ```R-package/inst/libs/x64```. MXNet supports only 64-bit operating systems, so you need the x64 folder.
+
+4. Copy the following shared libraries (.dll files) into the ```R-package/inst/libs/x64``` folder:
+```
 libgcc_s_seh-1.dll
 libgfortran-3.dll
 libmxnet.dll
 libmxnet.lib
 libopenblas.dll
 libquadmath-0.dll
-nvrtc64_80.dll
+mxnet.dll
+unzip.exe
+unzip32.dll
+vcomp140.dll
+wget.exe
 ```
-These dlls can be found in `prebuildbase_win10_x64_vc14/3rdparty/cudart`, `prebuildbase_win10_x64_vc14/3rdparty/openblas/bin`, `mxnet_x64_vc14_gpu/build`, `mxnet_x64_vc14_gpu/lib` and the `cuDNN` downloaded from NVIDIA.
+These dlls can be found in `prebuildbase_win10_x64_vc14/3rdparty`, `mxnet_x64_vc14_gpu_cuX/build`, `mxnet_x64_vc14_gpu_cuX/lib`.
-8. Copy the header files from `dmlc`, `mxnet` and `nnvm` into `./R-package/inst/include`. It should look like:
+
+5. Copy the header files from `dmlc`, `mxnet`, `mshadow` and `nnvm` from mxnet_x64_vc14_gpu_cuX/include and mxnet_x64_vc14_gpu_cuX/nnvm/include into `./R-package/inst/include`. It should look like:
 ```
 ./R-package/inst
 └── include
     ├── dmlc
     ├── mxnet
-    └── nnvm
+    ├── mshadow
+    └── nnvm
+
 ```
+6. Make sure that the R executable is added to your ```PATH``` in the environment variables. Running the ```where R``` command at the command prompt should return the location.
+7. Also make sure that Rtools is installed and the executable is added to your ```PATH``` in the environment variables.
+8. Temporary patch - im2rec currently results in crashes during the build. Remove the im2rec.h and im2rec.cc files in R-package/src/ from the cloned repository and comment out the two im2rec lines in [R-package/src/mxnet.cc](https://github.com/apache/incubator-mxnet/blob/master/R-package/src/mxnet.cc) as shown below.
+```cpp
+#include "./kvstore.h"
+#include "./export.h"
+//#include "./im2rec.h"
+......
+......
+  DataIterCreateFunction::InitRcppModule();
+  KVStore::InitRcppModule();
+  Exporter::InitRcppModule();
+//  IM2REC::InitRcppModule();
+}
+
+```
-9. Make sure that R is added to your ```PATH``` in the environment variables. Running the ```where R``` command at the command prompt should return the location.
-10. Now open the Windows CMD and change the directory to the `mxnet` folder. Then use the following commands
+9. Now open the Windows CMD with admin rights and change the directory to the `mxnet` folder (cloned repository).
Then use the following commands to build R package: ```bat From 461ba072c2531a3303e796425f19325627b966e7 Mon Sep 17 00:00:00 2001 From: Xinyu Chen Date: Tue, 31 Jul 2018 11:29:56 +0800 Subject: [PATCH 32/63] a hot fix for mkldnn link (#11939) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index c402c523b..7e9dc91e7 100644 --- a/README.md +++ b/README.md @@ -38,7 +38,7 @@ What's New * [Version 0.8.0 Release](https://github.com/dmlc/mxnet/releases/tag/v0.8.0) * [Updated Image Classification with new Pre-trained Models](./example/image-classification) * [Python Notebooks for How to Use MXNet](https://github.com/dmlc/mxnet-notebooks) -* [MKLDNN for Faster CPU Performance](./MKL_README.md) +* [MKLDNN for Faster CPU Performance](./MKLDNN_README.md) * [MXNet Memory Monger, Training Deeper Nets with Sublinear Memory Cost](https://github.com/dmlc/mxnet-memonger) * [Tutorial for NVidia GTC 2016](https://github.com/dmlc/mxnet-gtc-tutorial) * [Embedding Torch layers and functions in MXNet](https://mxnet.incubator.apache.org/faq/torch.html) From 7ffb2528c80b58005c9dc3b5ae4d273bebadfb71 Mon Sep 17 00:00:00 2001 From: Hao Jin Date: Mon, 30 Jul 2018 23:49:35 -0400 Subject: [PATCH 33/63] re-enabling randomized test_l2_normalization (#11900) --- tests/python/unittest/test_operator.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index fa5de0c68..99d635e35 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -3018,17 +3018,18 @@ def check_l2_normalization(in_shape, mode, dtype, norm_eps=1e-10): check_numeric_gradient(out, [in_data], numeric_eps=1e-3, rtol=1e-2, atol=1e-3) -# TODO(szha): Seeding this masks failures. We need to do a deep dive for failures without this seed. -@with_seed(1234) +# @haojin2: getting rid of the fixed seed as the flakiness could not be reproduced. 
+# tracked at: https://github.com/apache/incubator-mxnet/issues/11717 +@with_seed() def test_l2_normalization(): for dtype in ['float16', 'float32', 'float64']: for mode in ['channel', 'spatial', 'instance']: - for nbatch in [1, 4]: - for nchannel in [3, 5]: - for height in [4, 6]: - check_l2_normalization((nbatch, nchannel, height), mode, dtype) - for width in [5, 7]: - check_l2_normalization((nbatch, nchannel, height, width), mode, dtype) + nbatch = random.randint(1, 4) + nchannel = random.randint(3, 5) + height = random.randint(4, 6) + check_l2_normalization((nbatch, nchannel, height), mode, dtype) + width = random.randint(5, 7) + check_l2_normalization((nbatch, nchannel, height, width), mode, dtype) def check_layer_normalization(in_shape, axis, eps, dtype=np.float32, forward_check_eps=1E-3): From a56a569e22b9241c7be59edd03b5afee0d6d42b5 Mon Sep 17 00:00:00 2001 From: Piyush Ghai Date: Tue, 31 Jul 2018 02:50:13 -0700 Subject: [PATCH 34/63] [MXNET-651] MXNet Model Backwards Compatibility Checker (#11626) * Added MNIST-MLP-Module-API models to check model save and load_checkpoint methods * Added LENET with Conv2D operator training file * Added LENET with Conv2d operator inference file * Added LanguageModelling with RNN training file * Added LamguageModelling with RNN inference file * Added hybridized LENET Gluon Model training file * Added hybridized LENET gluon model inference file * Added license headers * Refactored the model and inference files and extracted out duplicate code in a common file * Added runtime function for executing the MBCC files * Added JenkinsFile for MBCC to be run as a nightly job * Added boto3 install for s3 uploads * Added README for MBCC * Added license header * Added more common functions from lm_rnn_gluon_train and inference files into common.py to clean up code * Added scripts for training models on older versions of MXNet * Added check for preventing inference script from crashing in case no trained models are found * Fixed indentation issue * Replaced Penn Tree Bank Dataset with Sherlock Holmes Dataset * Fixed indentation issue * Removed training in models and added smaller models. Now we are simply checking a forward pass in the model with dummy data. 
* Updated README * Fixed indentation error * Fixed indentation error * Removed code duplication in the training file * Added comments for runtime_functions script for training files * Merged S3 Buckets for storing data and models into one * Automated the process to fetch MXNet versions from git tags * Added defensive checks for the case where the data might not be found * Fixed issue where we were performing inference on state model files * Replaced print statements with logging ones * Removed boto install statements and move them into ubuntu_python docker * Separated training and uploading of models into separate files so that training runs in Docker and upload runs outside Docker * Fixed pylint warnings * Updated comments and README * Removed the venv for training process * Fixed indentation in the MBCC Jenkins file and also separated out training and inference into two separate stages * Fixed indendation * Fixed erroneous single quote * Added --user flag to check for Jenkins error * Removed unused methods * Added force flag in the pip command to install mxnet * Removed the force-re-install flag * Changed exit 1 to exit 0 * Added quotes around the shell command * added packlibs and unpack libs for MXNet builds * Changed PythonPath from relative to absolute * Created dedicated bucket with correct permission * Fix for python path in training * Changed bucket name to CI bucket * Added set -ex to the upload shell script * Now raising an exception if no models are found in the S3 bucket * Added regex to train models script * Added check for performing inference only on models trained on same major versions * Added set -ex flags to shell scripts * Added multi-version regex checks in training * Fixed typo in regex * Now we will train models for all the minor versions for a given major version by traversing the tags * Added check for validating current_version --- ci/docker/install/ubuntu_python.sh | 4 +- ci/docker/runtime_functions.sh | 14 ++ .../JenkinsfileForMBCC | 120 ++++++++++ .../README.md | 25 ++ .../common.py | 214 ++++++++++++++++++ .../model_backward_compat_checker.sh | 30 +++ .../model_backwards_compat_inference.py | 137 +++++++++++ .../model_backwards_compat_train.py | 127 +++++++++++ .../train_mxnet_legacy_models.sh | 89 ++++++++ .../upload_models_to_s3.sh | 43 ++++ 10 files changed, 801 insertions(+), 2 deletions(-) create mode 100644 tests/nightly/model_backwards_compatibility_check/JenkinsfileForMBCC create mode 100644 tests/nightly/model_backwards_compatibility_check/README.md create mode 100644 tests/nightly/model_backwards_compatibility_check/common.py create mode 100755 tests/nightly/model_backwards_compatibility_check/model_backward_compat_checker.sh create mode 100644 tests/nightly/model_backwards_compatibility_check/model_backwards_compat_inference.py create mode 100644 tests/nightly/model_backwards_compatibility_check/model_backwards_compat_train.py create mode 100755 tests/nightly/model_backwards_compatibility_check/train_mxnet_legacy_models.sh create mode 100755 tests/nightly/model_backwards_compatibility_check/upload_models_to_s3.sh diff --git a/ci/docker/install/ubuntu_python.sh b/ci/docker/install/ubuntu_python.sh index f087f0709..e71cac8a3 100755 --- a/ci/docker/install/ubuntu_python.sh +++ b/ci/docker/install/ubuntu_python.sh @@ -29,5 +29,5 @@ wget -nv https://bootstrap.pypa.io/get-pip.py python3 get-pip.py python2 get-pip.py -pip2 install nose cpplint==1.3.0 pylint==1.8.3 'numpy<1.15.0,>=1.8.2' nose-timer 'requests<2.19.0,>=2.18.4' h5py==2.8.0rc1 scipy==1.0.1 
-pip3 install nose cpplint==1.3.0 pylint==1.8.3 'numpy<1.15.0,>=1.8.2' nose-timer 'requests<2.19.0,>=2.18.4' h5py==2.8.0rc1 scipy==1.0.1 +pip2 install nose cpplint==1.3.0 pylint==1.8.3 'numpy<1.15.0,>=1.8.2' nose-timer 'requests<2.19.0,>=2.18.4' h5py==2.8.0rc1 scipy==1.0.1 boto3 +pip3 install nose cpplint==1.3.0 pylint==1.8.3 'numpy<1.15.0,>=1.8.2' nose-timer 'requests<2.19.0,>=2.18.4' h5py==2.8.0rc1 scipy==1.0.1 boto3 diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh index 8805850e3..52a2650a1 100755 --- a/ci/docker/runtime_functions.sh +++ b/ci/docker/runtime_functions.sh @@ -899,6 +899,20 @@ nightly_test_javascript() { make -C /work/mxnet/amalgamation libmxnet_predict.js MIN=1 EMCC=/work/deps/emscripten/emcc } +#Tests Model backwards compatibility on MXNet +nightly_model_backwards_compat_test() { + set -ex + export PYTHONPATH=/work/mxnet/python/ + ./tests/nightly/model_backwards_compatibility_check/model_backward_compat_checker.sh +} + +#Backfills S3 bucket with models trained on earlier versions of mxnet +nightly_model_backwards_compat_train() { + set -ex + export PYTHONPATH=./python/ + ./tests/nightly/model_backwards_compatibility_check/train_mxnet_legacy_models.sh +} + # Nightly 'MXNet: The Straight Dope' Single-GPU Tests nightly_straight_dope_python2_single_gpu_tests() { set -ex diff --git a/tests/nightly/model_backwards_compatibility_check/JenkinsfileForMBCC b/tests/nightly/model_backwards_compatibility_check/JenkinsfileForMBCC new file mode 100644 index 000000000..412d68d56 --- /dev/null +++ b/tests/nightly/model_backwards_compatibility_check/JenkinsfileForMBCC @@ -0,0 +1,120 @@ +// -*- mode: groovy -*- +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + + +//This is a Jenkinsfile for the model backwards compatibility checker. The format and some functions have been picked up from the top-level Jenkinsfile. 
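+// (Illustrative note, not part of the upstream patch: the pipeline runs three
+// stages. 'MBCC Train' trains models on older MXNet releases and uploads them
+// to S3, 'MXNet Build' compiles the current branch, and 'MBCC Inference'
+// reloads the stored models against the freshly built library.)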
+ +err = null +mx_lib = 'lib/libmxnet.so, lib/libmxnet.a, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a' + +def init_git() { + deleteDir() + retry(5) { + try { + timeout(time: 15, unit: 'MINUTES') { + checkout scm + sh 'git submodule update --init --recursive' + sh 'git clean -d -f' + } + } catch (exc) { + deleteDir() + error "Failed to fetch source codes with ${exc}" + sleep 2 + } + } +} + +// pack libraries for later use +def pack_lib(name, libs=mx_lib) { + sh """ +echo "Packing ${libs} into ${name}" +echo ${libs} | sed -e 's/,/ /g' | xargs md5sum +""" + stash includes: libs, name: name +} + +// unpack libraries saved before +def unpack_lib(name, libs=mx_lib) { + unstash name + sh """ +echo "Unpacked ${libs} from ${name}" +echo ${libs} | sed -e 's/,/ /g' | xargs md5sum +""" +} + +def docker_run(platform, function_name, use_nvidia, shared_mem = '500m') { + def command = "ci/build.py --docker-registry ${env.DOCKER_CACHE_REGISTRY} %USE_NVIDIA% --platform %PLATFORM% --shm-size %SHARED_MEM% /work/runtime_functions.sh %FUNCTION_NAME%" + command = command.replaceAll('%USE_NVIDIA%', use_nvidia ? '--nvidiadocker' : '') + command = command.replaceAll('%PLATFORM%', platform) + command = command.replaceAll('%FUNCTION_NAME%', function_name) + command = command.replaceAll('%SHARED_MEM%', shared_mem) + + sh command +} + +try { + stage('MBCC Train'){ + node('restricted-mxnetlinux-cpu') { + ws('workspace/modelBackwardsCompat') { + init_git() + // Train models on older versions + docker_run('ubuntu_nightly_cpu', 'nightly_model_backwards_compat_train', false) + // upload files to S3 here outside of the docker environment + sh "./tests/nightly/model_backwards_compatibility_check/upload_models_to_s3.sh" + } + } + } + + stage('MXNet Build'){ + node('restricted-mxnetlinux-cpu') { + ws('workspace/build-cpu') { + init_git() + docker_run('ubuntu_cpu','build_ubuntu_cpu', false) + pack_lib('cpu', mx_lib) + } + } + } + + stage('MBCC Inference'){ + node('restricted-mxnetlinux-cpu') { + ws('workspace/modelBackwardsCompat') { + init_git() + unpack_lib('cpu', mx_lib) + // Perform inference on the latest version of MXNet + docker_run('ubuntu_nightly_cpu', 'nightly_model_backwards_compat_test', false) + } + } + } +} catch (caughtError) { + node("restricted-mxnetlinux-cpu") { + sh "echo caught ${caughtError}" + err = caughtError + currentBuild.result = "FAILURE" + } +} finally { + node("restricted-mxnetlinux-cpu") { + // Only send email if model backwards compat test failed + if (currentBuild.result == "FAILURE") { + emailext body: 'Nightly tests for model backwards compatibity on MXNet branch : ${BRANCH_NAME} failed. Please view the build at ${BUILD_URL}', replyTo: '${EMAIL}', subject: '[MODEL BACKWARDS COMPATIBILITY TEST FAILED] build ${BUILD_NUMBER}', to: '${EMAIL}' + } + // Remember to rethrow so the build is marked as failing + if (err) { + throw err + } + } +} diff --git a/tests/nightly/model_backwards_compatibility_check/README.md b/tests/nightly/model_backwards_compatibility_check/README.md new file mode 100644 index 000000000..7a2116ac5 --- /dev/null +++ b/tests/nightly/model_backwards_compatibility_check/README.md @@ -0,0 +1,25 @@ +# Model Backwards Compatibility Tests + +This folder contains the scripts that are required to run the nightly job of verifying the compatibility and inference results of models (trained on earlier versions of MXNet) when loaded on the latest release candidate. The tests flag if: +- The models fail to load on the latest version of MXNet. 
+- The inference results are different.
+
+
+## JenkinsfileForMBCC
+This is the configuration file for the Jenkins job.
+
+## Details
+- Currently, the APIs covered for model saving/loading are: do_checkpoint/load_checkpoint, save_params/load_params, save_parameters/load_parameters (added v1.2.1 onwards), export/gluon.SymbolBlock.imports.
+- These APIs are exercised over models with architectures such as MLP, RNN, LeNet and LSTM, covering the four save/load API pairs listed above.
+- More operators/models will be added in the future to extend the operator coverage.
+- The model train file is suffixed by `_train.py` and the trained models are hosted in AWS S3.
+- The trained models for now are backfilled into S3 starting from every MXNet release version v1.1.0 via `train_mxnet_legacy_models.sh`.
+- The `train_mxnet_legacy_models.sh` script checks out the previous two releases using the git tag command and trains and uploads models on those MXNet versions.
+- The S3 bucket's folder structure looks like this:
+    * 1.1.0/
+    * 1.2.0/
+- There is also a folder which contains the trained model symbol definitions, toy datasets it was trained on, weights and parameters of the model, and other relevant files required to reload the model.
+- Over a period of time, the training script will have accumulated a repository of models trained over several versions of MXNet (both major and minor releases).
+- The inference part is checked via the script `model_backwards_compat_inference.sh`.
+- The inference script scans the S3 bucket for MXNet version folders as described above and runs the inference code for each model folder found.

diff --git a/tests/nightly/model_backwards_compatibility_check/common.py b/tests/nightly/model_backwards_compatibility_check/common.py
new file mode 100644
index 000000000..4c61cc4e3
--- /dev/null
+++ b/tests/nightly/model_backwards_compatibility_check/common.py
@@ -0,0 +1,214 @@
+#!/usr/bin/env python
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+import boto3
+import mxnet as mx
+import os
+import numpy as np
+import logging
+from mxnet import gluon
+import mxnet.ndarray as F
+from mxnet.gluon import nn
+import re
+from mxnet.test_utils import assert_almost_equal
+
+# Set fixed random seeds.
+mx.random.seed(7) +np.random.seed(7) +logging.basicConfig(level=logging.INFO) + +# get the current mxnet version we are running on +mxnet_version = mx.__version__ +model_bucket_name = 'mxnet-ci-prod-backwards-compatibility-models' +data_folder = 'mxnet-model-backwards-compatibility-data' +backslash = '/' +s3 = boto3.resource('s3') +ctx = mx.cpu(0) + + +def get_model_path(model_name): + return os.path.join(os.getcwd(), 'models', str(mxnet_version), model_name) + + +def get_module_api_model_definition(): + input = mx.symbol.Variable('data') + input = mx.symbol.Flatten(data=input) + + fc1 = mx.symbol.FullyConnected(data=input, name='fc1', num_hidden=128) + act1 = mx.sym.Activation(data=fc1, name='relu1', act_type="relu") + fc2 = mx.symbol.FullyConnected(data=act1, name='fc2', num_hidden=2) + op = mx.symbol.SoftmaxOutput(data=fc2, name='softmax') + model = mx.mod.Module(symbol=op, context=ctx, data_names=['data'], label_names=['softmax_label']) + return model + + +def save_inference_results(inference_results, model_name): + assert (isinstance(inference_results, mx.ndarray.ndarray.NDArray)) + save_path = os.path.join(get_model_path(model_name), ''.join([model_name, '-inference'])) + + mx.nd.save(save_path, {'inference': inference_results}) + + +def load_inference_results(model_name): + inf_dict = mx.nd.load(model_name+'-inference') + return inf_dict['inference'] + + +def save_data_and_labels(test_data, test_labels, model_name): + assert (isinstance(test_data, mx.ndarray.ndarray.NDArray)) + assert (isinstance(test_labels, mx.ndarray.ndarray.NDArray)) + + save_path = os.path.join(get_model_path(model_name), ''.join([model_name, '-data'])) + mx.nd.save(save_path, {'data': test_data, 'labels': test_labels}) + + +def clean_model_files(files, model_name): + files.append(model_name + '-inference') + files.append(model_name + '-data') + + for file in files: + if os.path.isfile(file): + os.remove(file) + + +def download_model_files_from_s3(model_name, folder_name): + model_files = list() + bucket = s3.Bucket(model_bucket_name) + prefix = folder_name + backslash + model_name + model_files_meta = list(bucket.objects.filter(Prefix = prefix)) + if len(model_files_meta) == 0: + logging.error('No trained models found under path : %s', prefix) + return model_files + for obj in model_files_meta: + file_name = obj.key.split('/')[2] + model_files.append(file_name) + # Download this file + bucket.download_file(obj.key, file_name) + + return model_files + + +def get_top_level_folders_in_bucket(s3client, bucket_name): + # This function returns the top level folders in the S3Bucket. + # These folders help us to navigate to the trained model files stored for different MXNet versions. + bucket = s3client.Bucket(bucket_name) + result = bucket.meta.client.list_objects(Bucket=bucket.name, Delimiter=backslash) + folder_list = list() + if 'CommonPrefixes' not in result: + logging.error('No trained models found in S3 bucket : %s for this file. ' + 'Please train the models and run inference again' % bucket_name) + raise Exception("No trained models found in S3 bucket : %s for this file. " + "Please train the models and run inference again" % bucket_name) + return folder_list + for obj in result['CommonPrefixes']: + folder_name = obj['Prefix'].strip(backslash) + # We only compare models from the same major versions. i.e. 1.x.x compared with latest 1.y.y etc + if str(folder_name).split('.')[0] != str(mxnet_version).split('.')[0]: + continue + # The top level folders contain MXNet Version # for trained models. 
Skipping the data folder here + if folder_name == data_folder: + continue + folder_list.append(obj['Prefix'].strip(backslash)) + + if len(folder_list) == 0: + logging.error('No trained models found in S3 bucket : %s for this file. ' + 'Please train the models and run inference again' % bucket_name) + raise Exception("No trained models found in S3 bucket : %s for this file. " + "Please train the models and run inference again" % bucket_name) + return folder_list + + +def create_model_folder(model_name): + path = get_model_path(model_name) + if not os.path.exists(path): + os.makedirs(path) + + +class Net(gluon.Block): + def __init__(self, **kwargs): + super(Net, self).__init__(**kwargs) + with self.name_scope(): + # layers created in name_scope will inherit name space + # from parent layer. + self.conv1 = nn.Conv2D(20, kernel_size=(5, 5)) + self.pool1 = nn.MaxPool2D(pool_size=(2, 2), strides=(2, 2)) + self.conv2 = nn.Conv2D(50, kernel_size=(5, 5)) + self.pool2 = nn.MaxPool2D(pool_size=(2, 2), strides=(2, 2)) + self.fc1 = nn.Dense(500) + self.fc2 = nn.Dense(2) + + def forward(self, x): + x = self.pool1(F.tanh(self.conv1(x))) + x = self.pool2(F.tanh(self.conv2(x))) + # 0 means copy over size from corresponding dimension. + # -1 means infer size from the rest of dimensions. + x = x.reshape((0, -1)) + x = F.tanh(self.fc1(x)) + x = F.tanh(self.fc2(x)) + return x + + +class HybridNet(gluon.HybridBlock): + def __init__(self, **kwargs): + super(HybridNet, self).__init__(**kwargs) + with self.name_scope(): + # layers created in name_scope will inherit name space + # from parent layer. + self.conv1 = nn.Conv2D(20, kernel_size=(5, 5)) + self.pool1 = nn.MaxPool2D(pool_size=(2, 2), strides=(2, 2)) + self.conv2 = nn.Conv2D(50, kernel_size=(5, 5)) + self.pool2 = nn.MaxPool2D(pool_size=(2, 2), strides=(2, 2)) + self.fc1 = nn.Dense(500) + self.fc2 = nn.Dense(2) + + def hybrid_forward(self, F, x): + x = self.pool1(F.tanh(self.conv1(x))) + x = self.pool2(F.tanh(self.conv2(x))) + # 0 means copy over size from corresponding dimension. + # -1 means infer size from the rest of dimensions. + x = x.reshape((0, -1)) + x = F.tanh(self.fc1(x)) + x = F.tanh(self.fc2(x)) + return x + + +class SimpleLSTMModel(gluon.Block): + def __init__(self, **kwargs): + super(SimpleLSTMModel, self).__init__(**kwargs) + with self.name_scope(): + self.model = mx.gluon.nn.Sequential(prefix='') + with self.model.name_scope(): + self.model.add(mx.gluon.nn.Embedding(30, 10)) + self.model.add(mx.gluon.rnn.LSTM(20)) + self.model.add(mx.gluon.nn.Dense(100)) + self.model.add(mx.gluon.nn.Dropout(0.5)) + self.model.add(mx.gluon.nn.Dense(2, flatten=True, activation='tanh')) + + def forward(self, x): + return self.model(x) + + +def compare_versions(version1, version2): + ''' + https://stackoverflow.com/questions/1714027/version-number-comparison-in-python + ''' + def normalize(v): + return [int(x) for x in re.sub(r'(\.0+)*$','', v).split(".")] + return cmp(normalize(version1), normalize(version2)) diff --git a/tests/nightly/model_backwards_compatibility_check/model_backward_compat_checker.sh b/tests/nightly/model_backwards_compatibility_check/model_backward_compat_checker.sh new file mode 100755 index 000000000..23386836e --- /dev/null +++ b/tests/nightly/model_backwards_compatibility_check/model_backward_compat_checker.sh @@ -0,0 +1,30 @@ +#!/usr/bin/env bash + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +#Author: Piyush Ghai + +set -ex + +echo "Invoking model_backwards_compat_checker.sh script" +echo `pwd` +cd tests/nightly/model_backwards_compatibility_check +echo `pwd` + +echo '==========================' +python model_backwards_compat_inference.py \ No newline at end of file diff --git a/tests/nightly/model_backwards_compatibility_check/model_backwards_compat_inference.py b/tests/nightly/model_backwards_compatibility_check/model_backwards_compat_inference.py new file mode 100644 index 000000000..ae368e3a0 --- /dev/null +++ b/tests/nightly/model_backwards_compatibility_check/model_backwards_compat_inference.py @@ -0,0 +1,137 @@ +#!/usr/bin/env python + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from common import * + + +def test_module_checkpoint_api(): + model_name = 'module_checkpoint_api' + print ('Performing inference for model/API %s' % model_name) + + # For each MXNet version that has the saved models + for folder in get_top_level_folders_in_bucket(s3, model_bucket_name): + logging.info('Fetching files for MXNet version : %s and model %s' % (folder, model_name)) + model_files = download_model_files_from_s3(model_name, folder) + if len(model_files) == 0: + logging.warn('No training files found for %s for MXNet version : %s' % (model_name, folder)) + continue + + data = mx.nd.load(''.join([model_name, '-data'])) + data_iter = mx.io.NDArrayIter(data['data'], data['labels'], batch_size=10) + # Load the model and perform inference + loaded_model = get_module_api_model_definition() + + sym, arg_params, aux_params = mx.model.load_checkpoint(model_name, 1) + loaded_model.bind(data_shapes=data_iter.provide_data, label_shapes=data_iter.provide_label) + loaded_model.set_params(arg_params, aux_params) + + old_inference_results = load_inference_results(model_name) + inference_results = loaded_model.predict(data_iter) + # Check whether they are equal or not ? 
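A brief note on the check that follows: `assert_almost_equal` presumably comes from `mxnet.test_utils` via the `from common import *` at the top of this file. It compares the two arrays elementwise under small relative/absolute tolerances rather than requiring bitwise equality, so numerically benign drift between MXNet versions still passes. A minimal sketch, assuming the default tolerances:

```python
import numpy as np
from mxnet.test_utils import assert_almost_equal

# Equal to within the default relative tolerance -> no AssertionError is raised
assert_almost_equal(np.array([1.0, 2.0]), np.array([1.0 + 1e-7, 2.0]))
```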
+        assert_almost_equal(inference_results.asnumpy(), old_inference_results.asnumpy())
+        clean_model_files(model_files, model_name)
+        logging.info('=================================')
+
+    logging.info('Assertion passed for model : %s' % model_name)
+
+
+def test_lenet_gluon_load_params_api():
+    model_name = 'lenet_gluon_save_params_api'
+    logging.info('Performing inference for model/API %s' % model_name)
+
+    for folder in get_top_level_folders_in_bucket(s3, model_bucket_name):
+        logging.info('Fetching files for MXNet version : %s and model %s' % (folder, model_name))
+        model_files = download_model_files_from_s3(model_name, folder)
+        if len(model_files) == 0:
+            logging.warn('No training files found for %s for MXNet version : %s' % (model_name, folder))
+            continue
+
+        data = mx.nd.load(''.join([model_name, '-data']))
+        test_data = data['data']
+        # Load the model and perform inference
+        loaded_model = Net()
+        loaded_model.load_params(model_name + '-params')
+        output = loaded_model(test_data)
+        old_inference_results = mx.nd.load(model_name + '-inference')['inference']
+        assert_almost_equal(old_inference_results.asnumpy(), output.asnumpy())
+        clean_model_files(model_files, model_name)
+        logging.info('=================================')
+    logging.info('Assertion passed for model : %s' % model_name)
+
+
+def test_lenet_gluon_hybrid_imports_api():
+    model_name = 'lenet_gluon_hybrid_export_api'
+    logging.info('Performing inference for model/API %s' % model_name)
+
+    for folder in get_top_level_folders_in_bucket(s3, model_bucket_name):
+        logging.info('Fetching files for MXNet version : %s and model %s' % (folder, model_name))
+        model_files = download_model_files_from_s3(model_name, folder)
+        if len(model_files) == 0:
+            logging.warn('No training files found for %s for MXNet version : %s' % (model_name, folder))
+            continue
+        # Load the model and perform inference
+        data = mx.nd.load(''.join([model_name, '-data']))
+        test_data = data['data']
+        loaded_model = gluon.SymbolBlock.imports(model_name + '-symbol.json', ['data'], model_name + '-0000.params')
+        output = loaded_model(test_data)
+        old_inference_results = mx.nd.load(model_name + '-inference')['inference']
+        assert_almost_equal(old_inference_results.asnumpy(), output.asnumpy())
+        clean_model_files(model_files, model_name)
+        logging.info('=================================')
+    logging.info('Assertion passed for model : %s' % model_name)
+
+
+def test_lstm_gluon_load_parameters_api():
+    # Execute this test only on MXNet versions >= 1.2.1,
+    # since it uses the save_parameters and load_parameters API
+    if compare_versions(str(mxnet_version), '1.2.1') < 0:
+        logging.warn('Found MXNet version %s and exiting because this version does not contain save_parameters'
+                     ' and load_parameters functions' % str(mxnet_version))
+        return
+
+    model_name = 'lstm_gluon_save_parameters_api'
+    logging.info('Performing inference for model/API %s' % model_name)
+
+    for folder in get_top_level_folders_in_bucket(s3, model_bucket_name):
+        logging.info('Fetching files for MXNet version : %s' % folder)
+        model_files = download_model_files_from_s3(model_name, folder)
+        if len(model_files) == 0:
+            logging.warn('No training files found for %s for MXNet version : %s' % (model_name, folder))
+            continue
+
+        data = mx.nd.load(''.join([model_name, '-data']))
+        test_data = data['data']
+        # Load the model and perform inference
+        loaded_model = SimpleLSTMModel()
+        loaded_model.load_parameters(model_name + '-params')
+        output = 
loaded_model(test_data) + old_inference_results = mx.nd.load(model_name + '-inference')['inference'] + assert_almost_equal(old_inference_results.asnumpy(), output.asnumpy()) + clean_model_files(model_files, model_name) + logging.info('=================================') + logging.info('Assertion passed for model : %s' % model_name) + + +if __name__ == '__main__': + test_module_checkpoint_api() + test_lenet_gluon_load_params_api() + test_lenet_gluon_hybrid_imports_api() + test_lstm_gluon_load_parameters_api() diff --git a/tests/nightly/model_backwards_compatibility_check/model_backwards_compat_train.py b/tests/nightly/model_backwards_compatibility_check/model_backwards_compat_train.py new file mode 100644 index 000000000..289d47c70 --- /dev/null +++ b/tests/nightly/model_backwards_compatibility_check/model_backwards_compat_train.py @@ -0,0 +1,127 @@ +#!/usr/bin/env python + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from common import * + + +def train_module_checkpoint_api(): + model_name = 'module_checkpoint_api' + create_model_folder(model_name) + logging.info('Saving files for model %s' % model_name) + # Prepare data + test_data = mx.nd.array(np.random.uniform(-1, 1, size=(20, 1))) + test_label = mx.nd.array(np.random.randint(0, 2, size=(20,)), dtype='float32') + data_iter = mx.io.NDArrayIter(test_data, test_label, batch_size=10) + + mod = get_module_api_model_definition() + mod.bind(data_shapes=data_iter.provide_data, label_shapes=data_iter.provide_label) + weights = mx.initializer.Xavier(magnitude=2.57) + mod.init_params(weights) + + mod.save_checkpoint(os.path.join(get_model_path(model_name), model_name), 1) + + inference_results = mod.predict(data_iter) + # Save inference_results + # Save the model files + save_data_and_labels(test_data, test_label, model_name) + save_inference_results(inference_results, model_name) + + +def train_lenet_gluon_save_params_api(): + model_name = 'lenet_gluon_save_params_api' + create_model_folder(model_name) + logging.info('Saving files for model %s' % model_name) + net = Net() + weights = mx.initializer.Xavier(magnitude=2.57) + net.initialize(weights, ctx=[mx.cpu(0)]) + # Prepare data + + test_data = mx.nd.array(np.random.uniform(-1, 1, size=(20, 1, 30, 30))) + output = net(test_data) + # print (y) + + mx.nd.save(os.path.join(get_model_path(model_name), ''.join([model_name, '-data'])), {'data': test_data}) + save_inference_results(output, model_name) + net.save_params(os.path.join(get_model_path(model_name), ''.join([model_name, '-params']))) + + +def train_lenet_gluon_hybrid_export_api(): + model_name = 'lenet_gluon_hybrid_export_api' + logging.info('Saving files for model %s' % model_name) + create_model_folder(model_name) + net = HybridNet() + weights = mx.initializer.Xavier(magnitude=2.57) + 
net.initialize(weights, ctx=[mx.cpu(0)])
+    net.hybridize()
+    # Prepare data
+    test_data = mx.nd.array(np.random.uniform(-1, 1, size=(20, 1, 30, 30)))
+    output = net(test_data)
+    # Save the test data, the inference output and the model parameters
+    mx.nd.save(os.path.join(get_model_path(model_name), ''.join([model_name, '-data'])), {'data': test_data})
+    save_inference_results(output, model_name)
+    if compare_versions(str(mxnet_version), '1.1.0') < 0:
+        # v1.0.0 does not have the epoch param in the .export() API. Hence adding this safety net
+        net.export(os.path.join(get_model_path(model_name), model_name))
+    else:
+        # Save with epoch=0, since that is the default v1.0.0 saved with. This keeps file names consistent
+        net.export(os.path.join(get_model_path(model_name), model_name), epoch=0)
+
+
+def train_lstm_gluon_save_parameters_api():
+    # Execute this step only on MXNet versions >= 1.2.1,
+    # since it uses the save_parameters and load_parameters API
+    if compare_versions(str(mxnet_version), '1.2.1') < 0:
+        logging.warn('Found MXNet version %s and exiting because this version does not contain save_parameters'
+                     ' and load_parameters functions' % str(mxnet_version))
+        return
+
+    model_name = 'lstm_gluon_save_parameters_api'
+    logging.info('Saving files for model %s' % model_name)
+    create_model_folder(model_name)
+    net = SimpleLSTMModel()
+    weights = mx.initializer.Xavier(magnitude=2.57)
+    net.initialize(weights, ctx=[mx.cpu(0)])
+
+    test_data = mx.nd.array(np.random.uniform(-1, 1, size=(10, 30)))
+    output = net(test_data)
+    mx.nd.save(os.path.join(get_model_path(model_name), ''.join([model_name, '-data'])), {'data': test_data})
+    save_inference_results(output, model_name)
+    net.save_parameters(os.path.join(get_model_path(model_name), ''.join([model_name, '-params'])))
+
+
+def create_root_folder():
+    base_path = os.getcwd()
+    version_path = os.path.join(base_path, 'models')
+    if not os.path.exists(version_path):
+        os.mkdir(version_path)
+
+
+if __name__ == '__main__':
+    create_root_folder()
+
+    train_module_checkpoint_api()
+    train_lenet_gluon_save_params_api()
+    train_lenet_gluon_hybrid_export_api()
+    train_lstm_gluon_save_parameters_api()
diff --git a/tests/nightly/model_backwards_compatibility_check/train_mxnet_legacy_models.sh b/tests/nightly/model_backwards_compatibility_check/train_mxnet_legacy_models.sh
new file mode 100755
index 000000000..336c61df2
--- /dev/null
+++ b/tests/nightly/model_backwards_compatibility_check/train_mxnet_legacy_models.sh
@@ -0,0 +1,89 @@
+#!/usr/bin/env bash
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
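The version checks above and in the shell driver that follows all funnel through `compare_versions` in `common.py`, which returns `cmp(...)`; the built-in `cmp` exists only in Python 2. For reference, a Python 3-portable sketch of the same normalize-and-compare idea (the regex is the one from `common.py`; the function name here is illustrative, not part of the patch):

```python
import re

def normalize(v):
    # Strip trailing ".0" components so that e.g. "1.2" compares equal to "1.2.0"
    return [int(x) for x in re.sub(r'(\.0+)*$', '', v).split('.')]

def compare_versions_py3(version1, version2):
    # Mimic Python 2's cmp(): returns -1, 0 or 1
    a, b = normalize(version1), normalize(version2)
    return (a > b) - (a < b)

assert compare_versions_py3('1.2.1', '1.2.0') > 0
assert compare_versions_py3('1.2', '1.2.0') == 0
assert compare_versions_py3('1.1.0', '1.2.1') < 0
```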
+
+#Author: Piyush Ghai
+
+set -ex
+
+run_models() {
+    echo '=========================='
+    echo "Running training files and preparing models"
+    echo '=========================='
+    python model_backwards_compat_train.py
+    echo '=========================='
+}
+
+install_mxnet() {
+    version=$1
+    echo "Installing MXNet $version"
+    pip install mxnet==$version --user
+}
+
+## Cuts the string and gives only the major version part.
+## eg : 12.3.0 ---> 12
+get_major_version() {
+    major=$(echo $1 | cut -d. -f1)
+    echo $major
+}
+
+## We read the current version from the libinfo.py file and extract the major version from it.
+curr_mxnet_version=$(grep -w "__version__" python/mxnet/libinfo.py | grep -o '".*"' | sed 's/"//g')
+## Expected in <major>.<minor>.<patch> format
+if [[ $curr_mxnet_version = [[:digit:][[:digit:]]*.[[:digit:][[:digit:]]*.[[:digit:][[:digit:]]* ]]
+then
+    curr_major_version=$(get_major_version $curr_mxnet_version)
+else
+    echo "The current version does not comply with the expected <major>.<minor>.<patch> format. Exiting here."
+    exit 1
+fi
+
+echo `pwd`
+cd tests/nightly/model_backwards_compatibility_check
+echo `pwd`
+
+## Fetch the latest release tags, filtering out 'rc' tags and other irrelevant ones.
+## This list is sorted in descending chronological order.
+## Sample output of the git tag command below : 1.2.0 utils 1.1.0 1.0.0 0.12.1
+## From this sample we pick up every version that shares a major version with the current one.
+## While performing inference, the latest version (say 1.3.0) then validates models trained
+## on 1.1.0 and 1.2.0 by loading them on that latest version.
+## Over a period of time the model repository will grow, since with every new release we
+## upload models trained on the newer versions as well through this script.
+previous_versions=($(git tag --sort=-creatordate | grep --invert-match rc))
+count=0
+for version in ${previous_versions[*]}
+do
+    ## Match versions of the <major>.<minor>.<patch> form, where each part may have multiple digits.
+    ## A traditional regex like [[:digit:]]+. does not work in bash pattern matching,
+    ## so we had to resort to [[:digit:]] [[:digit:]]* to indicate a multi-digit version match.
+    ## Example : previous_versions=(12.0.0 12.12.0 12.12.12 2.0.0 1.0.4 1.2.0 v.12.0.0 beta.12.0.1)
+    ## When passed through the pattern, the output is : [12.0.0 12.12.0 12.12.12 2.0.0 1.0.4 1.2.0]
+    if [[ $version = [[:digit:][[:digit:]]*.[[:digit:][[:digit:]]*.[[:digit:][[:digit:]]* ]]
+    then
+        major_version=$(get_major_version $version)
+        if [ ${major_version} -eq ${curr_major_version} ]
+        then
+            install_mxnet $version
+            run_models
+        fi
+    fi
+done
+exit 0
diff --git a/tests/nightly/model_backwards_compatibility_check/upload_models_to_s3.sh b/tests/nightly/model_backwards_compatibility_check/upload_models_to_s3.sh
new file mode 100755
index 000000000..16923980a
--- /dev/null
+++ b/tests/nightly/model_backwards_compatibility_check/upload_models_to_s3.sh
@@ -0,0 +1,43 @@
+#!/usr/bin/env bash
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +#Author: Piyush Ghai + +set -ex + +echo "uploading model files to s3" + +echo `pwd` +cd ./tests/nightly/model_backwards_compatibility_check/models/ +echo `pwd` + +# The directory structure will be as follows : +# / eg : +# ls /tests/nightly/model_backwards_compatibility_check/models/ +# 1.1.0/ 1.2.0/ 1.2.1/ +# we upload these folders to S3 and the inference files understand them and pull of models off them +for dir in $(ls `pwd`/) +do + echo $dir + aws s3 cp $dir/ s3://mxnet-ci-prod-backwards-compatibility-models/$dir/ --recursive +done + +echo "Deleting model files" +cd ../ +rm -rf `pwd`/models From 98a41af52a2c7cb9e30a6cf81796d431212f217d Mon Sep 17 00:00:00 2001 From: Lanking Date: Tue, 31 Jul 2018 10:53:32 -0700 Subject: [PATCH 35/63] [MXNET-531] NeuralStyle Example for Scala (#11621) * add initial neuralstyle and test coverage * Add two more test and README * kill comments * patch on memory leaks fix * fix formatting issues * remove redundant files * disable the Gan example for now * add ignore method * add new download scheme to match the changes --- .../neuralstyle/ModelVgg19.scala | 139 ++++----- .../neuralstyle/NeuralStyle.scala | 251 ++++++++-------- .../mxnetexamples/neuralstyle/README.md | 83 ++++++ .../neuralstyle/end2end/Basic.scala | 32 +-- .../neuralstyle/end2end/BoostInference.scala | 60 ++-- .../neuralstyle/end2end/BoostTrain.scala | 271 +++++++++--------- .../neuralstyle/end2end/DataProcessing.scala | 15 +- .../neuralstyle/end2end/GenV3.scala | 55 ++-- .../neuralstyle/end2end/GenV4.scala | 91 ++---- .../neuralstyle/end2end/ModelVgg19.scala | 111 ------- .../neuralstyle/end2end/Module.scala | 15 +- .../mxnetexamples/gan/GanExampleSuite.scala | 49 ++-- .../imclassification/MNISTExampleSuite.scala | 3 +- .../neuralstyle/NeuralStyleSuite.scala | 92 ++++++ 14 files changed, 642 insertions(+), 625 deletions(-) create mode 100644 scala-package/examples/src/main/scala/org/apache/mxnetexamples/neuralstyle/README.md delete mode 100644 scala-package/examples/src/main/scala/org/apache/mxnetexamples/neuralstyle/end2end/ModelVgg19.scala create mode 100644 scala-package/examples/src/test/scala/org/apache/mxnetexamples/neuralstyle/NeuralStyleSuite.scala diff --git a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/neuralstyle/ModelVgg19.scala b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/neuralstyle/ModelVgg19.scala index 4d9aa35d2..ca4c242ab 100644 --- a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/neuralstyle/ModelVgg19.scala +++ b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/neuralstyle/ModelVgg19.scala @@ -17,92 +17,73 @@ package org.apache.mxnetexamples.neuralstyle -import org.apache.mxnet.Context -import org.apache.mxnet.Executor -import org.apache.mxnet.NDArray -import org.apache.mxnet.Symbol -import org.apache.mxnet.Shape +import org.apache.mxnet.{Context, Executor, NDArray, Shape, Symbol} /** - * Definition for the neuralstyle network and initialize it with pretrained weight - * @author Depeng Liang - */ + * Definition for the neuralstyle network and initialize it with pretrained weight + */ object 
ModelVgg19 { case class ConvExecutor(executor: Executor, data: NDArray, dataGrad: NDArray, - style: Array[NDArray], content: NDArray, argDict: Map[String, NDArray]) + style: Array[NDArray], content: NDArray, argDict: Map[String, NDArray]) + + def ConvRelu(data : Symbol, convName : String, reluName : String, + numFilter : Int, kernel : (Int, Int) = (3, 3), + stride : (Int, Int) = (1, 1)) : Symbol = { + val conv = Symbol.api.Convolution(data = Some(data), num_filter = numFilter, + pad = Some(Shape(1, 1)), kernel = Shape(kernel._1, kernel._2), + stride = Some(Shape(stride._1, stride._2)), no_bias = Some(false), + workspace = Some(1024), name = convName) + val relu = Symbol.api.relu(data = Some(conv), name = reluName) + conv.dispose() + relu + } def getSymbol: (Symbol, Symbol) = { + getVggSymbol() + } + + def getVggSymbol(prefix: String = "", contentOnly: Boolean = false): (Symbol, Symbol) = { // declare symbol - val data = Symbol.Variable("data") - val conv1_1 = Symbol.Convolution("conv1_1")()(Map("data" -> data , "num_filter" -> 64, - "pad" -> "(1,1)", "kernel" -> "(3,3)", "stride" -> "(1,1)", - "no_bias" -> false, "workspace" -> 1024)) - val relu1_1 = Symbol.Activation("relu1_1")()(Map("data" -> conv1_1 , "act_type" -> "relu")) - val conv1_2 = Symbol.Convolution("conv1_2")()(Map("data" -> relu1_1 , "num_filter" -> 64, - "pad" -> "(1,1)", "kernel" -> "(3,3)", "stride" -> "(1,1)", - "no_bias" -> false, "workspace" -> 1024)) - val relu1_2 = Symbol.Activation("relu1_2")()(Map("data" -> conv1_2 , "act_type" -> "relu")) - val pool1 = Symbol.Pooling("pool1")()(Map("data" -> relu1_2 , "pad" -> "(0,0)", - "kernel" -> "(2,2)", "stride" -> "(2,2)", "pool_type" -> "avg")) - val conv2_1 = Symbol.Convolution("conv2_1")()(Map("data" -> pool1 , "num_filter" -> 128, - "pad" -> "(1,1)", "kernel" -> "(3,3)", "stride" -> "(1,1)", - "no_bias" -> false, "workspace" -> 1024)) - val relu2_1 = Symbol.Activation("relu2_1")()(Map("data" -> conv2_1 , "act_type" -> "relu")) - val conv2_2 = Symbol.Convolution("conv2_2")()(Map("data" -> relu2_1 , "num_filter" -> 128, - "pad" -> "(1,1)", "kernel" -> "(3,3)", "stride" -> "(1,1)", - "no_bias" -> false, "workspace" -> 1024)) - val relu2_2 = Symbol.Activation("relu2_2")()(Map("data" -> conv2_2 , "act_type" -> "relu")) - val pool2 = Symbol.Pooling("pool2")()(Map("data" -> relu2_2 , "pad" -> "(0,0)", - "kernel" -> "(2,2)", "stride" -> "(2,2)", "pool_type" -> "avg")) - val conv3_1 = Symbol.Convolution("conv3_1")()(Map("data" -> pool2 , "num_filter" -> 256, - "pad" -> "(1,1)", "kernel" -> "(3,3)", "stride" -> "(1,1)", - "no_bias" -> false, "workspace" -> 1024)) - val relu3_1 = Symbol.Activation("relu3_1")()(Map("data" -> conv3_1 , "act_type" -> "relu")) - val conv3_2 = Symbol.Convolution("conv3_2")()(Map("data" -> relu3_1 , "num_filter" -> 256, - "pad" -> "(1,1)", "kernel" -> "(3,3)", "stride" -> "(1,1)", - "no_bias" -> false, "workspace" -> 1024)) - val relu3_2 = Symbol.Activation("'relu3_2")()(Map("data" -> conv3_2 , "act_type" -> "relu")) - val conv3_3 = Symbol.Convolution("conv3_3")()(Map("data" -> relu3_2 , "num_filter" -> 256, - "pad" -> "(1,1)", "kernel" -> "(3,3)", "stride" -> "(1,1)", - "no_bias" -> false, "workspace" -> 1024)) - val relu3_3 = Symbol.Activation("relu3_3")()(Map("data" -> conv3_3 , "act_type" -> "relu")) - val conv3_4 = Symbol.Convolution("conv3_4")()(Map("data" -> relu3_3 , "num_filter" -> 256, - "pad" -> "(1,1)", "kernel" -> "(3,3)", "stride" -> "(1,1)", - "no_bias" -> false, "workspace" -> 1024)) - val relu3_4 = 
Symbol.Activation("relu3_4")()(Map("data" -> conv3_4 , "act_type" -> "relu")) - val pool3 = Symbol.Pooling("pool3")()(Map("data" -> relu3_4 , "pad" -> "(0,0)", - "kernel" -> "(2,2)", "stride" -> "(2,2)", "pool_type" -> "avg")) - val conv4_1 = Symbol.Convolution("conv4_1")()(Map("data" -> pool3 , "num_filter" -> 512, - "pad" -> "(1,1)", "kernel" -> "(3,3)", "stride" -> "(1,1)", - "no_bias" -> false, "workspace" -> 1024)) - val relu4_1 = Symbol.Activation("relu4_1")()(Map("data" -> conv4_1 , "act_type" -> "relu")) - val conv4_2 = Symbol.Convolution("conv4_2")()(Map("data" -> relu4_1 , "num_filter" -> 512, - "pad" -> "(1,1)", "kernel" -> "(3,3)", "stride" -> "(1,1)", - "no_bias" -> false, "workspace" -> 1024)) - val relu4_2 = Symbol.Activation("relu4_2")()(Map("data" -> conv4_2 , "act_type" -> "relu")) - val conv4_3 = Symbol.Convolution("conv4_3")()(Map("data" -> relu4_2 , "num_filter" -> 512, - "pad" -> "(1,1)", "kernel" -> "(3,3)", "stride" -> "(1,1)", - "no_bias" -> false, "workspace" -> 1024)) - val relu4_3 = Symbol.Activation("relu4_3")()(Map("data" -> conv4_3 , "act_type" -> "relu")) - val conv4_4 = Symbol.Convolution("conv4_4")()(Map("data" -> relu4_3 , "num_filter" -> 512, - "pad" -> "(1,1)", "kernel" -> "(3,3)", "stride" -> "(1,1)", - "no_bias" -> false, "workspace" -> 1024)) - val relu4_4 = Symbol.Activation("relu4_4")()(Map("data" -> conv4_4 , "act_type" -> "relu")) - val pool4 = Symbol.Pooling("pool4")()(Map("data" -> relu4_4 , "pad" -> "(0,0)", - "kernel" -> "(2,2)", "stride" -> "(2,2)", "pool_type" -> "avg")) - val conv5_1 = Symbol.Convolution("conv5_1")()(Map("data" -> pool4 , "num_filter" -> 512, - "pad" -> "(1,1)", "kernel" -> "(3,3)", "stride" -> "(1,1)", - "no_bias" -> false, "workspace" -> 1024)) - val relu5_1 = Symbol.Activation("relu5_1")()(Map("data" -> conv5_1 , "act_type" -> "relu")) + val data = Symbol.Variable(s"${prefix}data") + + val relu1_1 = ConvRelu(data, s"${prefix}conv1_1", s"${prefix}relu1_1", 64) + val relu1_2 = ConvRelu(relu1_1, s"${prefix}conv1_2", s"${prefix}relu1_2", 64) + val pool1 = Symbol.api.Pooling(data = Some(relu1_2), pad = Some(Shape(0, 0)), + kernel = Some(Shape(2, 2)), stride = Some(Shape(2, 2)), pool_type = Some("avg"), + name = s"${prefix}pool1") + + val relu2_1 = ConvRelu(pool1, s"${prefix}conv2_1", s"${prefix}relu2_1", 128) + val relu2_2 = ConvRelu(relu2_1, s"${prefix}conv2_2", s"${prefix}relu2_2", 128) + val pool2 = Symbol.api.Pooling(data = Some(relu2_2), pad = Some(Shape(0, 0)), + kernel = Some(Shape(2, 2)), stride = Some(Shape(2, 2)), pool_type = Some("avg"), + name = s"${prefix}pool2") + + val relu3_1 = ConvRelu(pool2, s"${prefix}conv3_1", s"${prefix}relu3_1", 256) + val relu3_2 = ConvRelu(relu3_1, s"${prefix}conv3_2", s"${prefix}relu3_2", 256) + val relu3_3 = ConvRelu(relu3_2, s"${prefix}conv3_3", s"${prefix}relu3_3", 256) + val relu3_4 = ConvRelu(relu3_3, s"${prefix}conv3_4", s"${prefix}relu3_4", 256) + val pool3 = Symbol.api.Pooling(data = Some(relu3_4), pad = Some(Shape(0, 0)), + kernel = Some(Shape(2, 2)), stride = Some(Shape(2, 2)), pool_type = Some("avg"), + name = s"${prefix}pool3") + + val relu4_1 = ConvRelu(pool3, s"${prefix}conv4_1", s"${prefix}relu4_1", 512) + val relu4_2 = ConvRelu(relu4_1, s"${prefix}conv4_2", s"${prefix}relu4_2", 512) + val relu4_3 = ConvRelu(relu4_2, s"${prefix}conv4_3", s"${prefix}relu4_3", 512) + val relu4_4 = ConvRelu(relu4_3, s"${prefix}conv4_4", s"${prefix}relu4_4", 512) + val pool4 = Symbol.api.Pooling(data = Some(relu4_4), pad = Some(Shape(0, 0)), + kernel = Some(Shape(2, 2)), stride = 
Some(Shape(2, 2)), pool_type = Some("avg"), + name = s"${prefix}pool4") + + val relu5_1 = ConvRelu(pool4, s"${prefix}conv5_1", s"${prefix}relu5_1", 512) // style and content layers - val style = Symbol.Group(relu1_1, relu2_1, relu3_1, relu4_1, relu5_1) + val style = if (contentOnly) null else Symbol.Group(relu1_1, relu2_1, relu3_1, relu4_1, relu5_1) val content = Symbol.Group(relu4_2) (style, content) } def getExecutor(style: Symbol, content: Symbol, modelPath: String, - inputSize: (Int, Int), ctx: Context): ConvExecutor = { + inputSize: (Int, Int), ctx: Context): ConvExecutor = { val out = Symbol.Group(style, content) // make executor val (argShapes, outputShapes, auxShapes) = out.inferShape( @@ -116,15 +97,17 @@ object ModelVgg19 { val key = s"arg:$name" if (pretrained.contains(key)) argDict(name).set(pretrained(key)) } + pretrained.foreach(ele => ele._2.dispose()) val executor = out.bind(ctx, argDict, gradDict) + out.dispose() val outArray = executor.outputs ConvExecutor(executor = executor, - data = argDict("data"), - dataGrad = gradDict("data"), - style = outArray.take(outArray.length - 1), - content = outArray(outArray.length - 1), - argDict = argDict) - } + data = argDict("data"), + dataGrad = gradDict("data"), + style = outArray.take(outArray.length - 1), + content = outArray(outArray.length - 1), + argDict = argDict) + } def getModel(modelPath: String, inputSize: (Int, Int), ctx: Context): ConvExecutor = { val (style, content) = getSymbol diff --git a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/neuralstyle/NeuralStyle.scala b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/neuralstyle/NeuralStyle.scala index d99ea641b..f98d725c2 100644 --- a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/neuralstyle/NeuralStyle.scala +++ b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/neuralstyle/NeuralStyle.scala @@ -17,22 +17,22 @@ package org.apache.mxnetexamples.neuralstyle -import org.apache.mxnet._ -import org.kohsuke.args4j.{CmdLineParser, Option} -import org.slf4j.LoggerFactory -import scala.collection.JavaConverters._ -import com.sksamuel.scrimage.Image import java.io.File -import com.sksamuel.scrimage.Pixel + +import com.sksamuel.scrimage.{Image, Pixel} import com.sksamuel.scrimage.filter.GaussianBlurFilter import com.sksamuel.scrimage.nio.JpegWriter +import org.apache.mxnet._ import org.apache.mxnet.optimizer.Adam +import org.kohsuke.args4j.{CmdLineParser, Option} +import org.slf4j.LoggerFactory + +import scala.collection.JavaConverters._ +import scala.collection.mutable.ListBuffer /** - * An Implementation of the paper A Neural Algorithm of Artistic Style - * by Leon A. Gatys, Alexander S. 
Ecker, and Matthias Bethge - * @author Depeng Liang - */ + * An Implementation of the paper A Neural Algorithm of Artistic Style + */ object NeuralStyle { case class NSExecutor(executor: Executor, data: NDArray, dataGrad: NDArray) @@ -109,11 +109,11 @@ object NeuralStyle { var gradScale = List[Int]() for (i <- 0 until style.listOutputs().length) { val shape = outputShape(i) - val x = Symbol.Reshape()()(Map("data" -> style.get(i), - "target_shape" -> Shape(shape(1), shape(2) * shape(3)))) - // use fully connected to quickly do dot(x, x^T) - val gram = Symbol.FullyConnected()()(Map("data" -> x, "weight" -> x, - "no_bias" -> true, "num_hidden" -> shape(1))) + val x = Symbol.api.Reshape(data = Some(style.get(i)), + target_shape = Some(Shape(shape(1), shape(2) * shape(3)))) + val gram = Symbol.api.FullyConnected(data = Some(x), weight = Some(x), + no_bias = Some(true), num_hidden = shape(1)) + x.dispose() gramList = gramList :+ gram gradScale = gradScale :+ (shape(1) * shape(2) * shape(3) * shape(1)) } @@ -121,13 +121,20 @@ object NeuralStyle { } def getLoss(gram: Symbol, content: Symbol): (Symbol, Symbol) = { - var gramLoss = List[Symbol]() + var gramLoss = ListBuffer[Symbol]() for (i <- 0 until gram.listOutputs().length) { val gvar = Symbol.Variable(s"target_gram_$i") - gramLoss = gramLoss :+ Symbol.sum()(Symbol.square()(gvar - gram.get(i))())() + Symbol.api.square(data = Some(gvar - gram.get(i))) + gramLoss += Symbol.api.sum( + Some(Symbol.api.square(data = Some(gvar - gram.get(i)))) + ) + gvar.dispose() } + gram.dispose() val cvar = Symbol.Variable("target_content") - val contentLoss = Symbol.sum()(Symbol.square()(cvar - content)())() + val contentLoss = Symbol.api.sum( + Some(Symbol.api.square(Some(cvar - content))) + ) (Symbol.Group(gramLoss: _*), contentLoss) } @@ -138,12 +145,13 @@ object NeuralStyle { val nChannel = img.shape(1) val sImg = Symbol.Variable("img") val sKernel = Symbol.Variable("kernel") - val channels = Symbol.SliceChannel()(sImg)(Map("num_outputs" -> nChannel)) - val out = Symbol.Concat()((0 until nChannel).map { i => - Symbol.Convolution()()(Map("data" -> channels.get(i), "weight" -> sKernel, - "num_filter" -> 1, "kernel" -> "(3,3)", "pad" -> "(1,1)", - "no_bias" -> true, "stride" -> "(1,1)")) - }: _*)() * tvWeight + val channels = Symbol.api.SliceChannel(data = Some(sImg), num_outputs = nChannel) + val result = (0 until nChannel).map { i => + Symbol.api.Convolution(data = Some(channels.get(i)), weight = Some(sKernel), + num_filter = 1, kernel = Shape(3, 3), pad = Some(Shape(1, 1)), no_bias = Some(true), + stride = Some(Shape(1, 1))) + }.toArray + val out = Symbol.api.Concat(result, result.length) * tvWeight val kernel = { val tmp = NDArray.empty(Shape(1, 1, 3, 3), ctx) tmp.set(Array[Float](0, -1, 0, -1, 4, -1, 0, -1, 0)) @@ -156,104 +164,123 @@ object NeuralStyle { Math.sqrt(array.map(x => x * x).sum.toDouble).toFloat } - def main(args: Array[String]): Unit = { - val alle = new NeuralStyle - val parser: CmdLineParser = new CmdLineParser(alle) - try { - parser.parseArgument(args.toList.asJava) - assert(alle.contentImage != null && alle.styleImage != null - && alle.modelPath != null && alle.outputDir != null) + //scalastyle:off + def runTraining(model : String, contentImage : String, styleImage: String, dev : Context, + modelPath : String, outputDir : String, styleWeight : Float, + contentWeight : Float, tvWeight : Float, gaussianRadius : Int, + lr: Float, maxNumEpochs: Int, maxLongEdge: Int, + saveEpochs : Int, stopEps: Float) : Unit = { - val dev = if (alle.gpu >= 0) 
Context.gpu(alle.gpu) else Context.cpu(0) - val contentNp = preprocessContentImage(alle.contentImage, alle.maxLongEdge, dev) - val styleNp = preprocessStyleImage(alle.styleImage, contentNp.shape, dev) - val size = (contentNp.shape(2), contentNp.shape(3)) + val contentNp = preprocessContentImage(contentImage, maxLongEdge, dev) + val styleNp = preprocessStyleImage(styleImage, contentNp.shape, dev) + val size = (contentNp.shape(2), contentNp.shape(3)) - val (style, content) = ModelVgg19.getSymbol - val (gram, gScale) = styleGramSymbol(size, style) - var modelExecutor = ModelVgg19.getExecutor(gram, content, alle.modelPath, size, dev) + val (style, content) = ModelVgg19.getSymbol + val (gram, gScale) = styleGramSymbol(size, style) + var modelExecutor = ModelVgg19.getExecutor(gram, content, modelPath, size, dev) - modelExecutor.data.set(styleNp) - modelExecutor.executor.forward() + modelExecutor.data.set(styleNp) + modelExecutor.executor.forward() - val styleArray = modelExecutor.style.map(_.copyTo(Context.cpu())) - modelExecutor.data.set(contentNp) - modelExecutor.executor.forward() - val contentArray = modelExecutor.content.copyTo(Context.cpu()) + val styleArray = modelExecutor.style.map(_.copyTo(Context.cpu())) + modelExecutor.data.set(contentNp) + modelExecutor.executor.forward() + val contentArray = modelExecutor.content.copyTo(Context.cpu()) - // delete the executor - modelExecutor = null + // delete the executor + modelExecutor.argDict.foreach(ele => ele._2.dispose()) + modelExecutor.content.dispose() + modelExecutor.data.dispose() + modelExecutor.dataGrad.dispose() + modelExecutor.style.foreach(_.dispose()) + modelExecutor.executor.dispose() + modelExecutor = null - val (styleLoss, contentLoss) = getLoss(gram, content) - modelExecutor = ModelVgg19.getExecutor( - styleLoss, contentLoss, alle.modelPath, size, dev) + val (styleLoss, contentLoss) = getLoss(gram, content) + modelExecutor = ModelVgg19.getExecutor( + styleLoss, contentLoss, modelPath, size, dev) - val gradArray = { - var tmpGA = Array[NDArray]() - for (i <- 0 until styleArray.length) { - modelExecutor.argDict(s"target_gram_$i").set(styleArray(i)) - tmpGA = tmpGA :+ NDArray.ones(Shape(1), dev) * (alle.styleWeight / gScale(i)) - } - tmpGA :+ NDArray.ones(Shape(1), dev) * alle.contentWeight + val gradArray = { + var tmpGA = Array[NDArray]() + for (i <- 0 until styleArray.length) { + modelExecutor.argDict(s"target_gram_$i").set(styleArray(i)) + tmpGA = tmpGA :+ NDArray.ones(Shape(1), dev) * (styleWeight / gScale(i)) } + tmpGA :+ NDArray.ones(Shape(1), dev) * contentWeight + } - modelExecutor.argDict("target_content").set(contentArray) - - // train - val img = Random.uniform(-0.1f, 0.1f, contentNp.shape, dev) - val lr = new FactorScheduler(step = 10, factor = 0.9f) - - saveImage(contentNp, s"${alle.outputDir}/input.jpg", alle.guassianRadius) - saveImage(styleNp, s"${alle.outputDir}/style.jpg", alle.guassianRadius) - - val optimizer = new Adam( - learningRate = alle.lr, - wd = 0.005f, - lrScheduler = lr) - val optimState = optimizer.createState(0, img) - - logger.info(s"start training arguments $alle") - - var oldImg = img.copyTo(dev) - val clipNorm = img.shape.toVector.reduce(_ * _) - val tvGradExecutor = getTvGradExecutor(img, dev, alle.tvWeight) - var eps = 0f - var trainingDone = false - var e = 0 - while (e < alle.maxNumEpochs && !trainingDone) { - modelExecutor.data.set(img) - modelExecutor.executor.forward() - modelExecutor.executor.backward(gradArray) - - val gNorm = NDArray.norm(modelExecutor.dataGrad).toScalar - if (gNorm > 
clipNorm) { - modelExecutor.dataGrad.set(modelExecutor.dataGrad * (clipNorm / gNorm)) - } - tvGradExecutor match { - case Some(executor) => { - executor.forward() - optimizer.update(0, img, - modelExecutor.dataGrad + executor.outputs(0), - optimState) - } - case None => - optimizer.update(0, img, modelExecutor.dataGrad, optimState) - } - eps = (NDArray.norm(oldImg - img) / NDArray.norm(img)).toScalar - oldImg.set(img) - logger.info(s"epoch $e, relative change $eps") + modelExecutor.argDict("target_content").set(contentArray) - if (eps < alle.stopEps) { - logger.info("eps < args.stop_eps, training finished") - trainingDone = true - } - if ((e + 1) % alle.saveEpochs == 0) { - saveImage(img, s"${alle.outputDir}/tmp_${e + 1}.jpg", alle.guassianRadius) + // train + val img = Random.uniform(-0.1f, 0.1f, contentNp.shape, dev) + val lrFS = new FactorScheduler(step = 10, factor = 0.9f) + + saveImage(contentNp, s"${outputDir}/input.jpg", gaussianRadius) + saveImage(styleNp, s"${outputDir}/style.jpg", gaussianRadius) + + val optimizer = new Adam( + learningRate = lr, + wd = 0.005f, + lrScheduler = lrFS) + val optimState = optimizer.createState(0, img) + + logger.info(s"start training arguments") + + var oldImg = img.copyTo(dev) + val clipNorm = img.shape.toVector.reduce(_ * _) + val tvGradExecutor = getTvGradExecutor(img, dev, tvWeight) + var eps = 0f + var trainingDone = false + var e = 0 + while (e < maxNumEpochs && !trainingDone) { + modelExecutor.data.set(img) + modelExecutor.executor.forward() + modelExecutor.executor.backward(gradArray) + + val gNorm = NDArray.norm(modelExecutor.dataGrad).toScalar + if (gNorm > clipNorm) { + modelExecutor.dataGrad.set(modelExecutor.dataGrad * (clipNorm / gNorm)) + } + tvGradExecutor match { + case Some(executor) => { + executor.forward() + optimizer.update(0, img, + modelExecutor.dataGrad + executor.outputs(0), + optimState) } - e = e + 1 + case None => + optimizer.update(0, img, modelExecutor.dataGrad, optimState) + } + eps = (NDArray.norm(oldImg - img) / NDArray.norm(img)).toScalar + oldImg.set(img) + logger.info(s"epoch $e, relative change $eps") + + if (eps < stopEps) { + logger.info("eps < args.stop_eps, training finished") + trainingDone = true + } + if ((e + 1) % saveEpochs == 0) { + saveImage(img, s"${outputDir}/tmp_${e + 1}.jpg", gaussianRadius) } - saveImage(img, s"${alle.outputDir}/out.jpg", alle.guassianRadius) - logger.info("Finish fit ...") + e = e + 1 + } + saveImage(img, s"${outputDir}/out.jpg", gaussianRadius) + logger.info("Finish fit ...") + } + + def main(args: Array[String]): Unit = { + val alle = new NeuralStyle + val parser: CmdLineParser = new CmdLineParser(alle) + try { + parser.parseArgument(args.toList.asJava) + assert(alle.contentImage != null && alle.styleImage != null + && alle.modelPath != null && alle.outputDir != null) + + val dev = if (alle.gpu >= 0) Context.gpu(alle.gpu) else Context.cpu(0) + runTraining(alle.model, alle.contentImage, alle.styleImage, dev, alle.modelPath, + alle.outputDir, alle.styleWeight, alle.contentWeight, alle.tvWeight, + alle.gaussianRadius, alle.lr, alle.maxNumEpochs, alle.maxLongEdge, + alle.saveEpochs, alle.stopEps) } catch { case ex: Exception => { logger.error(ex.getMessage, ex) @@ -293,6 +320,6 @@ class NeuralStyle { private val outputDir: String = null @Option(name = "--save-epochs", usage = "save the output every n epochs") private val saveEpochs: Int = 50 - @Option(name = "--guassian-radius", usage = "the gaussian blur filter radius") - private val guassianRadius: Int = 1 + @Option(name = 
"--gaussian-radius", usage = "the gaussian blur filter radius") + private val gaussianRadius: Int = 1 } diff --git a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/neuralstyle/README.md b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/neuralstyle/README.md new file mode 100644 index 000000000..fe849343c --- /dev/null +++ b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/neuralstyle/README.md @@ -0,0 +1,83 @@ +# Neural Style Example for Scala + +## Introduction +This model contains three important components: +- Boost Inference +- Boost Training +- Neural Style conversion + +You can use the prebuilt VGG model to do the conversion. +By adding a style image, you can create several interesting images. + +Original Image | Style Image +:-------------------------:|:-------------------------: +![](https://s3.us-east-2.amazonaws.com/mxnet-scala/scala-example-ci/NeuralStyle/IMG_4343.jpg) | ![](https://s3.us-east-2.amazonaws.com/mxnet-scala/scala-example-ci/NeuralStyle/starry_night.jpg) + +Boost Inference Image (pretrained) | Epoch 150 Image +:-------------------------:|:-------------------------: +![](https://s3.us-east-2.amazonaws.com/mxnet-scala/scala-example-ci/NeuralStyle/out_3.jpg) | ![](https://s3.us-east-2.amazonaws.com/mxnet-scala/scala-example-ci/NeuralStyle/tmp_150.jpg) + +## Setup +Please download the input image and style image following the links below: + +Input image +```bash +https://s3.us-east-2.amazonaws.com/mxnet-scala/scala-example-ci/NeuralStyle/IMG_4343.jpg +``` +Style image +```bash +https://s3.us-east-2.amazonaws.com/mxnet-scala/scala-example-ci/NeuralStyle/starry_night.jpg +``` + +VGG model --Boost inference +```bash +https://s3.us-east-2.amazonaws.com/mxnet-scala/scala-example-ci/NeuralStyle/model.zip +``` + +VGG model --Boost Training +```bash +https://s3.us-east-2.amazonaws.com/mxnet-scala/scala-example-ci/NeuralStyle/vgg19.params +``` + +Please unzip the model before you use it. + +## Boost Inference Example + +Please provide the corresponding arguments before you execute the program +```bash +--input-image +/IMG_4343.jpg +--model-path +/model +--output-path + +``` + +## Boost Training Example +Please download your own training data for boost training. +You can use 26k images sampled from [MIT Place dataset](http://places.csail.mit.edu/). 
+```bash
+--style-image
+<path>/starry_night.jpg
+--data-path
+<path>/images
+--vgg-model-path
+<path>/vgg19.params
+--save-model-path
+<path>
+```
+
+## NeuralStyle Example
+Please provide the corresponding arguments before you execute the program
+```bash
+--model-path
+<path>/vgg19.params
+--content-image
+<path>/IMG_4343.jpg
+--style-image
+<path>/starry_night.jpg
+--gpu
+<gpu-id>
+--output-dir
+<path>
+```
\ No newline at end of file
diff --git a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/neuralstyle/end2end/Basic.scala b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/neuralstyle/end2end/Basic.scala
index c604f842c..56303253f 100644
--- a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/neuralstyle/end2end/Basic.scala
+++ b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/neuralstyle/end2end/Basic.scala
@@ -17,16 +17,11 @@
 
 package org.apache.mxnetexamples.neuralstyle.end2end
 
-import org.apache.mxnet.Shape
-import org.apache.mxnet.Context
-import org.apache.mxnet.NDArray
-import org.apache.mxnet.Symbol
-import org.apache.mxnet.Initializer
+import org.apache.mxnet.{Context, Initializer, NDArray, Shape, Symbol}
+import org.apache.mxnetexamples.neuralstyle.ModelVgg19
 import org.slf4j.LoggerFactory
 
-/**
- * @author Depeng Liang
- */
+
 object Basic {
 
   class PretrainedInit(prefix: String, params: Map[String, NDArray],
@@ -61,7 +56,7 @@ object Basic {
   def getStyleModule(prefix: String, dShape: Shape,
       ctx: Context, params: Map[String, NDArray]): Module = {
     val inputShape = Map(s"${prefix}_data" -> dShape)
-    val (style, content) = ModelVgg19.getVggSymbol(prefix)
+    val (style, content) = ModelVgg19.getVggSymbol(prefix + "_")
     val (gram, gScale) = styleGramSymbol(inputShape, style)
     val init = new PretrainedInit(prefix, params, true)
     new Module(symbol = gram, context = ctx,
@@ -75,11 +70,10 @@ object Basic {
     var gradScale = List[Int]()
     for (i <- 0 until style.listOutputs().length) {
       val shape = outputShape(i)
-      val x = Symbol.Reshape()()(Map("data" -> style.get(i),
-        "shape" -> Shape(shape(1), shape(2) * shape(3))))
-      // use fully connected to quickly do dot(x, x^T)
-      val gram = Symbol.FullyConnected()()(Map("data" -> x, "weight" -> x,
-        "no_bias" -> true, "num_hidden" -> shape(1)))
+      val x = Symbol.api.Reshape(data = Some(style.get(i)),
+        shape = Some(Shape(shape(1), shape(2) * shape(3))))
+      val gram = Symbol.api.FullyConnected(data = Some(x), weight = Some(x),
+        no_bias = Some(true), num_hidden = shape(1))
       gramList = gramList :+ gram
       gradScale = gradScale :+ (shape(1) * shape(2) * shape(3) * shape(1))
     }
@@ -90,16 +84,18 @@ object Basic {
     var gramLoss = List[Symbol]()
     for (i <- 0 until gram.listOutputs().length) {
       val gvar = Symbol.Variable(s"target_gram_$i")
-      gramLoss = gramLoss :+ Symbol.sum()(Symbol.square()(gvar - gram.get(i))())()
+      gramLoss = gramLoss :+ Symbol.api.sum(Some(
+        Symbol.api.square(Some(gvar - gram.get(i)))
+      ))
     }
     val cvar = Symbol.Variable("target_content")
-    val contentLoss = Symbol.sum()(Symbol.square()(cvar - content)())()
+    val contentLoss = Symbol.api.sum(Some(Symbol.api.square(Some(cvar - content))))
     (Symbol.Group(gramLoss: _*), contentLoss)
   }
 
   def getContentModule(prefix: String, dShape: Shape,
       ctx: Context, params: Map[String, NDArray]): Module = {
-    val (_, sym) = ModelVgg19.getVggSymbol(prefix, true)
+    val (_, sym) = ModelVgg19.getVggSymbol(prefix + "_", true)
     val init = new PretrainedInit(prefix, params)
     new Module(symbol = sym, context = ctx,
       dataShapes = Map(s"${prefix}_data" -> dShape),
@@ -109,7 +105,7 @@ object Basic {
   def getLossModule(prefix: String, 
dShape: Shape, ctx: Context, params: Map[String, NDArray]): (Module, List[Int]) = { val inputShape = Map(s"${prefix}_data" -> dShape) - val (style, content) = ModelVgg19.getVggSymbol(prefix) + val (style, content) = ModelVgg19.getVggSymbol(prefix + "_") val (gram, gScale) = styleGramSymbol(inputShape, style) val (styleLoss, contentLoss) = getLoss(gram, content) val sym = Symbol.Group(styleLoss, contentLoss) diff --git a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/neuralstyle/end2end/BoostInference.scala b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/neuralstyle/end2end/BoostInference.scala index 0feb73d30..5410fb9ed 100644 --- a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/neuralstyle/end2end/BoostInference.scala +++ b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/neuralstyle/end2end/BoostInference.scala @@ -17,19 +17,43 @@ package org.apache.mxnetexamples.neuralstyle.end2end -import org.slf4j.LoggerFactory +import org.apache.mxnet.{Context, Shape} import org.kohsuke.args4j.{CmdLineParser, Option} +import org.slf4j.LoggerFactory + import scala.collection.JavaConverters._ -import org.apache.mxnet.Shape -import org.apache.mxnet.Context -/** - * @author Depeng Liang - */ object BoostInference { private val logger = LoggerFactory.getLogger(classOf[BoostInference]) + def runInference(modelPath: String, outputPath: String, guassianRadius : Int, + inputImage : String, ctx : Context): Unit = { + val dShape = Shape(1, 3, 480, 640) + val clipNorm = 1.0f * dShape.product + // generator + val gens = Array( + GenV4.getModule("g0", dShape, ctx, isTrain = false), + GenV3.getModule("g1", dShape, ctx, isTrain = false), + GenV3.getModule("g2", dShape, ctx, isTrain = false), + GenV4.getModule("g3", dShape, ctx, isTrain = false) + ) + gens.zipWithIndex.foreach { case (gen, i) => + gen.loadParams(s"$modelPath/$i/v3_0002-0026000.params") + } + + val contentNp = + DataProcessing.preprocessContentImage(s"$inputImage", dShape, ctx) + var data = Array(contentNp) + for (i <- 0 until gens.length) { + gens(i).forward(data.takeRight(1)) + val newImg = gens(i).getOutputs()(0) + data :+= newImg + DataProcessing.saveImage(newImg, s"$outputPath/out_$i.jpg", guassianRadius) + logger.info(s"Converted image: $outputPath/out_$i.jpg") + } + } + def main(args: Array[String]): Unit = { val stce = new BoostInference val parser: CmdLineParser = new CmdLineParser(stce) @@ -39,30 +63,10 @@ object BoostInference { && stce.inputImage != null && stce.outputPath != null) - val dShape = Shape(1, 3, 480, 640) - val clipNorm = 1.0f * dShape.product val ctx = if (stce.gpu == -1) Context.cpu() else Context.gpu(stce.gpu) - // generator - val gens = Array( - GenV4.getModule("g0", dShape, ctx, isTrain = false), - GenV3.getModule("g1", dShape, ctx, isTrain = false), - GenV3.getModule("g2", dShape, ctx, isTrain = false), - GenV4.getModule("g3", dShape, ctx, isTrain = false) - ) - gens.zipWithIndex.foreach { case (gen, i) => - gen.loadParams(s"${stce.modelPath}/$i/v3_0002-0026000.params") - } + runInference(stce.modelPath, stce.outputPath, stce.guassianRadius, stce.inputImage, ctx) - val contentNp = - DataProcessing.preprocessContentImage(s"${stce.inputImage}", dShape, ctx) - var data = Array(contentNp) - for (i <- 0 until gens.length) { - gens(i).forward(data.takeRight(1)) - val newImg = gens(i).getOutputs()(0) - data :+= newImg - DataProcessing.saveImage(newImg, s"${stce.outputPath}/out_${i}.jpg", stce.guassianRadius) - } } catch { case ex: Exception => { 
logger.error(ex.getMessage, ex) @@ -74,7 +78,7 @@ object BoostInference { } class BoostInference { - @Option(name = "--model-path", usage = "the save model path") + @Option(name = "--model-path", usage = "the saved model path") private val modelPath: String = null @Option(name = "--input-image", usage = "the style image") private val inputImage: String = null diff --git a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/neuralstyle/end2end/BoostTrain.scala b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/neuralstyle/end2end/BoostTrain.scala index 8b5549de4..08b4c85d2 100644 --- a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/neuralstyle/end2end/BoostTrain.scala +++ b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/neuralstyle/end2end/BoostTrain.scala @@ -17,24 +17,17 @@ package org.apache.mxnetexamples.neuralstyle.end2end -import org.slf4j.LoggerFactory +import java.io.File + +import org.apache.mxnet.{Context, Executor, NDArray, Shape, Symbol} +import org.apache.mxnet.optimizer.SGD import org.kohsuke.args4j.{CmdLineParser, Option} +import org.slf4j.LoggerFactory + import scala.collection.JavaConverters._ -import org.apache.mxnet.NDArray -import org.apache.mxnet.Shape -import org.apache.mxnet.Context -import org.apache.mxnet.DataBatch -import org.apache.mxnet.Symbol -import org.apache.mxnet.Executor -import org.apache.mxnet.optimizer.SGD -import java.io.File -import javax.imageio.ImageIO import scala.util.Random -import org.apache.mxnet.optimizer.Adam -/** - * @author Depeng Liang - */ + object BoostTrain { private val logger = LoggerFactory.getLogger(classOf[BoostTrain]) @@ -46,12 +39,13 @@ object BoostTrain { val nChannel = img.shape(1) val sImg = Symbol.Variable("img") val sKernel = Symbol.Variable("kernel") - val channels = Symbol.SliceChannel()(sImg)(Map("num_outputs" -> nChannel)) - val out = Symbol.Concat()((0 until nChannel).map { i => - Symbol.Convolution()()(Map("data" -> channels.get(i), "weight" -> sKernel, - "num_filter" -> 1, "kernel" -> "(3,3)", "pad" -> "(1,1)", - "no_bias" -> true, "stride" -> "(1,1)")) - }.toArray: _*)() * tvWeight + val channels = Symbol.api.SliceChannel(data = Some(sImg), num_outputs = nChannel) + val toConcat = (0 until nChannel).map( i => + Symbol.api.Convolution(data = Some(channels.get(i)), weight = Some(sKernel), + num_filter = 1, kernel = Shape(3, 3), pad = Some(Shape(1, 1)), + no_bias = Some(true), stride = Some(Shape(1, 1))) + ).toArray + val out = Symbol.api.Concat(data = toConcat, num_args = toConcat.length) * tvWeight val kernel = { val tmp = NDArray.empty(Shape(1, 1, 3, 3), ctx) tmp.set(Array[Float](0, -1, 0, -1, 4, -1, 0, -1, 0)) @@ -60,130 +54,135 @@ object BoostTrain { out.bind(ctx, Map("img" -> img, "kernel" -> kernel)) } - def main(args: Array[String]): Unit = { - val stin = new BoostTrain - val parser: CmdLineParser = new CmdLineParser(stin) - try { - parser.parseArgument(args.toList.asJava) - assert(stin.dataPath != null - && stin.vggModelPath != null - && stin.saveModelPath != null - && stin.styleImage != null) - // params - val vggParams = NDArray.load2Map(stin.vggModelPath) - val styleWeight = 1.2f - val contentWeight = 10f - val dShape = Shape(1, 3, 384, 384) - val clipNorm = 0.05f * dShape.product - val modelPrefix = "v3" - val ctx = if (stin.gpu == -1) Context.cpu() else Context.gpu(stin.gpu) - - // init style - val styleNp = DataProcessing.preprocessStyleImage(stin.styleImage, dShape, ctx) - var styleMod = Basic.getStyleModule("style", dShape, ctx, vggParams) - 
styleMod.forward(Array(styleNp)) - val styleArray = styleMod.getOutputs().map(_.copyTo(Context.cpu())) - styleMod.dispose() - styleMod = null - - // content - val contentMod = Basic.getContentModule("content", dShape, ctx, vggParams) - - // loss - val (loss, gScale) = Basic.getLossModule("loss", dShape, ctx, vggParams) - val extraArgs = (0 until styleArray.length) - .map( i => s"target_gram_$i" -> styleArray(i)).toMap - loss.setParams(extraArgs) - var gradArray = Array[NDArray]() - for (i <- 0 until styleArray.length) { - gradArray = gradArray :+ (NDArray.ones(Shape(1), ctx) * (styleWeight / gScale(i))) - } - gradArray = gradArray :+ (NDArray.ones(Shape(1), ctx) * contentWeight) - - // generator - val gens = Array( - GenV4.getModule("g0", dShape, ctx), - GenV3.getModule("g1", dShape, ctx), - GenV3.getModule("g2", dShape, ctx), - GenV4.getModule("g3", dShape, ctx) - ) - gens.foreach { gen => - val opt = new SGD(learningRate = 1e-4f, - momentum = 0.9f, - wd = 5e-3f, - clipGradient = 5f) - gen.initOptimizer(opt) - } + def runTraining(dataPath : String, vggModelPath: String, ctx : Context, + styleImage : String, saveModelPath : String) : Unit = { + // params + val vggParams = NDArray.load2Map(vggModelPath) + val styleWeight = 1.2f + val contentWeight = 10f + val dShape = Shape(1, 3, 384, 384) + val clipNorm = 0.05f * dShape.product + val modelPrefix = "v3" + // init style + val styleNp = DataProcessing.preprocessStyleImage(styleImage, dShape, ctx) + var styleMod = Basic.getStyleModule("style", dShape, ctx, vggParams) + styleMod.forward(Array(styleNp)) + val styleArray = styleMod.getOutputs().map(_.copyTo(Context.cpu())) + styleMod.dispose() + styleMod = null + + // content + val contentMod = Basic.getContentModule("content", dShape, ctx, vggParams) + + // loss + val (loss, gScale) = Basic.getLossModule("loss", dShape, ctx, vggParams) + val extraArgs = (0 until styleArray.length) + .map( i => s"target_gram_$i" -> styleArray(i)).toMap + loss.setParams(extraArgs) + var gradArray = Array[NDArray]() + for (i <- 0 until styleArray.length) { + gradArray = gradArray :+ (NDArray.ones(Shape(1), ctx) * (styleWeight / gScale(i))) + } + gradArray = gradArray :+ (NDArray.ones(Shape(1), ctx) * contentWeight) + + // generator + val gens = Array( + GenV4.getModule("g0", dShape, ctx), + GenV3.getModule("g1", dShape, ctx), + GenV3.getModule("g2", dShape, ctx), + GenV4.getModule("g3", dShape, ctx) + ) + gens.foreach { gen => + val opt = new SGD(learningRate = 1e-4f, + momentum = 0.9f, + wd = 5e-3f, + clipGradient = 5f) + gen.initOptimizer(opt) + } - var filelist = new File(stin.dataPath).list().toList - val numImage = filelist.length - logger.info(s"Dataset size: $numImage") + var filelist = new File(dataPath).list().toList + val numImage = filelist.length + logger.info(s"Dataset size: $numImage") - val tvWeight = 1e-2f + val tvWeight = 1e-2f - val startEpoch = 0 - val endEpoch = 3 + val startEpoch = 0 + val endEpoch = 3 - for (k <- 0 until gens.length) { - val path = new File(s"${stin.saveModelPath}/$k") - if (!path.exists()) path.mkdir() - } + for (k <- 0 until gens.length) { + val path = new File(s"${saveModelPath}/$k") + if (!path.exists()) path.mkdir() + } - // train - for (i <- startEpoch until endEpoch) { - filelist = Random.shuffle(filelist) - for (idx <- filelist.indices) { - var dataArray = Array[NDArray]() - var lossGradArray = Array[NDArray]() - val data = - DataProcessing.preprocessContentImage(s"${stin.dataPath}/${filelist(idx)}", dShape, ctx) - dataArray = dataArray :+ data - // get content - 
contentMod.forward(Array(data)) - // set target content - loss.setParams(Map("target_content" -> contentMod.getOutputs()(0))) - // gen_forward - for (k <- 0 until gens.length) { - gens(k).forward(dataArray.takeRight(1)) - dataArray = dataArray :+ gens(k).getOutputs()(0) - // loss forward - loss.forward(dataArray.takeRight(1)) - loss.backward(gradArray) - lossGradArray = lossGradArray :+ loss.getInputGrads()(0) - } - val grad = NDArray.zeros(data.shape, ctx) - for (k <- gens.length - 1 to 0 by -1) { - val tvGradExecutor = getTvGradExecutor(gens(k).getOutputs()(0), ctx, tvWeight) - tvGradExecutor.forward() - grad += lossGradArray(k) + tvGradExecutor.outputs(0) - val gNorm = NDArray.norm(grad) - if (gNorm.toScalar > clipNorm) { - grad *= clipNorm / gNorm.toScalar - } - gens(k).backward(Array(grad)) - gens(k).update() - gNorm.dispose() - tvGradExecutor.dispose() + // train + for (i <- startEpoch until endEpoch) { + filelist = Random.shuffle(filelist) + for (idx <- filelist.indices) { + var dataArray = Array[NDArray]() + var lossGradArray = Array[NDArray]() + val data = + DataProcessing.preprocessContentImage(s"${dataPath}/${filelist(idx)}", dShape, ctx) + dataArray = dataArray :+ data + // get content + contentMod.forward(Array(data)) + // set target content + loss.setParams(Map("target_content" -> contentMod.getOutputs()(0))) + // gen_forward + for (k <- 0 until gens.length) { + gens(k).forward(dataArray.takeRight(1)) + dataArray = dataArray :+ gens(k).getOutputs()(0) + // loss forward + loss.forward(dataArray.takeRight(1)) + loss.backward(gradArray) + lossGradArray = lossGradArray :+ loss.getInputGrads()(0) + } + val grad = NDArray.zeros(data.shape, ctx) + for (k <- gens.length - 1 to 0 by -1) { + val tvGradExecutor = getTvGradExecutor(gens(k).getOutputs()(0), ctx, tvWeight) + tvGradExecutor.forward() + grad += lossGradArray(k) + tvGradExecutor.outputs(0) + val gNorm = NDArray.norm(grad) + if (gNorm.toScalar > clipNorm) { + grad *= clipNorm / gNorm.toScalar } - grad.dispose() - if (idx % 20 == 0) { - logger.info(s"Epoch $i: Image $idx") - for (k <- 0 until gens.length) { - val n = NDArray.norm(gens(k).getInputGrads()(0)) - logger.info(s"Data Norm : ${n.toScalar / dShape.product}") - n.dispose() - } + gens(k).backward(Array(grad)) + gens(k).update() + gNorm.dispose() + tvGradExecutor.dispose() + } + grad.dispose() + if (idx % 20 == 0) { + logger.info(s"Epoch $i: Image $idx") + for (k <- 0 until gens.length) { + val n = NDArray.norm(gens(k).getInputGrads()(0)) + logger.info(s"Data Norm : ${n.toScalar / dShape.product}") + n.dispose() } - if (idx % 1000 == 0) { - for (k <- 0 until gens.length) { - gens(k).saveParams( - s"${stin.saveModelPath}/$k/${modelPrefix}_" + - s"${"%04d".format(i)}-${"%07d".format(idx)}.params") - } + } + if (idx % 1000 == 0) { + for (k <- 0 until gens.length) { + gens(k).saveParams( + s"${saveModelPath}/$k/${modelPrefix}_" + + s"${"%04d".format(i)}-${"%07d".format(idx)}.params") } - data.dispose() } + data.dispose() } + } + } + + def main(args: Array[String]): Unit = { + val stin = new BoostTrain + val parser: CmdLineParser = new CmdLineParser(stin) + try { + parser.parseArgument(args.toList.asJava) + assert(stin.dataPath != null + && stin.vggModelPath != null + && stin.saveModelPath != null + && stin.styleImage != null) + + val ctx = if (stin.gpu == -1) Context.cpu() else Context.gpu(stin.gpu) + runTraining(stin.dataPath, stin.vggModelPath, ctx, stin.styleImage, stin.saveModelPath) } catch { case ex: Exception => { logger.error(ex.getMessage, ex) @@ -197,9 +196,9 @@ 
object BoostTrain { class BoostTrain { @Option(name = "--data-path", usage = "the input train data path") private val dataPath: String = null - @Option(name = "--vgg--model-path", usage = "the pretrained model to use: ['vgg']") + @Option(name = "--vgg-model-path", usage = "the pretrained model to use: ['vgg']") private val vggModelPath: String = null - @Option(name = "--save--model-path", usage = "the save model path") + @Option(name = "--save-model-path", usage = "the save model path") private val saveModelPath: String = null @Option(name = "--style-image", usage = "the style image") private val styleImage: String = null diff --git a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/neuralstyle/end2end/DataProcessing.scala b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/neuralstyle/end2end/DataProcessing.scala index 94d05bb7d..80a009ea4 100644 --- a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/neuralstyle/end2end/DataProcessing.scala +++ b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/neuralstyle/end2end/DataProcessing.scala @@ -17,19 +17,14 @@ package org.apache.mxnetexamples.neuralstyle.end2end -import com.sksamuel.scrimage.Image -import com.sksamuel.scrimage.Pixel +import java.io.File + +import com.sksamuel.scrimage.{Image, Pixel} import com.sksamuel.scrimage.filter.GaussianBlurFilter import com.sksamuel.scrimage.nio.JpegWriter -import org.apache.mxnet.Context -import org.apache.mxnet.NDArray -import java.io.File -import org.apache.mxnet.Shape -import scala.util.Random +import org.apache.mxnet.{Context, NDArray, Shape} + -/** - * @author Depeng Liang - */ object DataProcessing { def preprocessContentImage(path: String, diff --git a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/neuralstyle/end2end/GenV3.scala b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/neuralstyle/end2end/GenV3.scala index b90e9f0e3..d7ab59e28 100644 --- a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/neuralstyle/end2end/GenV3.scala +++ b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/neuralstyle/end2end/GenV3.scala @@ -17,34 +17,33 @@ package org.apache.mxnetexamples.neuralstyle.end2end -import org.apache.mxnet.Symbol -import org.apache.mxnet.Shape -import org.apache.mxnet.Context -import org.apache.mxnet.Xavier +import org.apache.mxnet.{Context, Shape, Symbol, Xavier} + -/** - * @author Depeng Liang - */ object GenV3 { def Conv(data: Symbol, numFilter: Int, kernel: (Int, Int) = (5, 5), - pad: (Int, Int) = (2, 2), stride: (Int, Int) = (2, 2)): Symbol = { - var sym = Symbol.Convolution()()(Map("data" -> data, "num_filter" -> numFilter, - "kernel" -> s"$kernel", "stride" -> s"$stride", "pad" -> s"$pad", "no_bias" -> false)) - sym = Symbol.BatchNorm()()(Map("data" -> sym, "fix_gamma" -> false)) - sym = Symbol.LeakyReLU()()(Map("data" -> sym, "act_type" -> "leaky")) - sym + pad: (Int, Int) = (2, 2), stride: (Int, Int) = (2, 2)): Symbol = { + val sym1 = Symbol.api.Convolution(data = Some(data), num_filter = numFilter, + kernel = Shape(kernel._1, kernel._2), stride = Some(Shape(stride._1, stride._2)), + pad = Some(Shape(pad._1, pad._2)), no_bias = Some(false)) + val sym2 = Symbol.api.BatchNorm(data = Some(sym1), fix_gamma = Some(false)) + val sym3 = Symbol.api.LeakyReLU(data = Some(sym2), act_type = Some("leaky")) + sym2.dispose() + sym1.dispose() + sym3 } def Deconv(data: Symbol, numFilter: Int, imHw: (Int, Int), - kernel: (Int, Int) = (7, 7), pad: (Int, Int) = (2, 2), stride: (Int, Int) = 
(2, 2), - crop: Boolean = true, out: Boolean = false): Symbol = { - var sym = Symbol.Deconvolution()()(Map("data" -> data, "num_filter" -> numFilter, - "kernel" -> s"$kernel", "stride" -> s"$stride", "pad" -> s"$pad", "no_bias" -> true)) - if (crop) sym = Symbol.Crop()(sym)( - Map("offset" -> "(1, 1)", "h_w" -> s"$imHw", "num_args" -> 1)) - sym = Symbol.BatchNorm()()(Map("data" -> sym, "fix_gamma" -> false)) - if (out == false) Symbol.LeakyReLU()()(Map("data" -> sym, "act_type" -> "leaky")) - else Symbol.Activation()()(Map("data" -> sym, "act_type" -> "tanh")) + kernel: (Int, Int) = (7, 7), pad: (Int, Int) = (2, 2), stride: (Int, Int) = (2, 2), + crop: Boolean = true, out: Boolean = false): Symbol = { + var sym = Symbol.api.Deconvolution(data = Some(data), num_filter = numFilter, + kernel = Shape(kernel._1, kernel._2), stride = Some(Shape(stride._1, stride._2)), + pad = Some(Shape(pad._1, pad._2)), no_bias = Some(true)) + if (crop) sym = Symbol.api.Crop(data = Array(sym), offset = Some(Shape(1, 1)), + h_w = Some(Shape(imHw._1, imHw._2)), num_args = 1) + sym = Symbol.api.BatchNorm(data = Some(sym), fix_gamma = Some(false)) + if (out == false) Symbol.api.LeakyReLU(data = Some(sym), act_type = Some("leaky")) + else Symbol.api.Activation(data = Some(sym), act_type = "tanh") } def getGenerator(prefix: String, imHw: (Int, Int)): Symbol = { @@ -61,12 +60,12 @@ object GenV3 { val conv5_1 = Conv(deconv2, 96, kernel = (3, 3), pad = (1, 1), stride = (1, 1)) val deconv3 = Deconv(conv5_1, 3, imHw, kernel = (8, 8), pad = (3, 3), out = true, crop = false) val rawOut = (deconv3 * 128) + 128 - val norm = Symbol.SliceChannel()(rawOut)(Map("num_outputs" -> 3)) + val norm = Symbol.api.SliceChannel(data = Some(rawOut), num_outputs = 3) val rCh = norm.get(0) - 123.68f val gCh = norm.get(1) - 116.779f val bCh = norm.get(2) - 103.939f - val normOut = Symbol.Concat()(rCh, gCh, bCh)() * 0.4f + data * 0.6f - normOut + val normOut = Symbol.api.Concat(data = Array(rCh, gCh, bCh), num_args = 3) + normOut * 0.4f + data * 0.6f } def getModule(prefix: String, dShape: Shape, ctx: Context, isTrain: Boolean = true): Module = { @@ -77,9 +76,9 @@ object GenV3 { else (dataShape, false, false) } val mod = new Module(symbol = sym, context = ctx, - dataShapes = dataShapes, - initializer = new Xavier(magnitude = 2f), - forTraining = forTraining, inputsNeedGrad = inputsNeedGrad) + dataShapes = dataShapes, + initializer = new Xavier(magnitude = 2f), + forTraining = forTraining, inputsNeedGrad = inputsNeedGrad) mod } } diff --git a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/neuralstyle/end2end/GenV4.scala b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/neuralstyle/end2end/GenV4.scala index 876a0529b..82fc9b6ce 100644 --- a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/neuralstyle/end2end/GenV4.scala +++ b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/neuralstyle/end2end/GenV4.scala @@ -17,78 +17,43 @@ package org.apache.mxnetexamples.neuralstyle.end2end -import org.apache.mxnet.Symbol -import org.apache.mxnet.Shape -import org.apache.mxnet.Context -import org.apache.mxnet.Xavier +import org.apache.mxnet.{Context, Shape, Symbol, Xavier} -/** - * @author Depeng Liang - */ -object GenV4 { - def Conv(data: Symbol, numFilter: Int, kernel: (Int, Int) = (5, 5), - pad: (Int, Int) = (2, 2), stride: (Int, Int) = (2, 2)): Symbol = { - var sym = Symbol.Convolution()()(Map("data" -> data, "num_filter" -> numFilter, - "kernel" -> s"$kernel", "stride" -> s"$stride", "pad" -> 
s"$pad", "no_bias" -> false)) - sym = Symbol.BatchNorm()()(Map("data" -> sym, "fix_gamma" -> false)) - sym = Symbol.LeakyReLU()()(Map("data" -> sym, "act_type" -> "leaky")) - sym - } +object GenV4 { - def Deconv(data: Symbol, numFilter: Int, imHw: (Int, Int), kernel: (Int, Int) = (6, 6), - pad: (Int, Int) = (2, 2), stride: (Int, Int) = (2, 2), out: Boolean = false): Symbol = { - var sym = Symbol.Deconvolution()()(Map("data" -> data, "num_filter" -> numFilter, - "kernel" -> s"$kernel", "stride" -> s"$stride", "pad" -> s"$pad", "no_bias" -> true)) - sym = Symbol.BatchNorm()()(Map("data" -> sym, "fix_gamma" -> false)) - if (out == false) Symbol.LeakyReLU()()(Map("data" -> sym, "act_type" -> "leaky")) - else Symbol.Activation()()(Map("data" -> sym, "act_type" -> "tanh")) + def Conv(data: Symbol, numFilter: Int, workspace : Long, kernel: (Int, Int) = (5, 5), + pad: (Int, Int) = (2, 2)): Symbol = { + val sym1 = Symbol.api.Convolution(data = Some(data), num_filter = numFilter, + kernel = Shape(kernel._1, kernel._2), workspace = Some(workspace), + pad = Some(Shape(pad._1, pad._2)), no_bias = Some(false)) + val sym2 = Symbol.api.BatchNorm(data = Some(sym1), fix_gamma = Some(false)) + val sym3 = Symbol.api.LeakyReLU(data = Some(sym2), act_type = Some("leaky")) + sym2.dispose() + sym1.dispose() + sym3 } def getGenerator(prefix: String, imHw: (Int, Int)): Symbol = { val data = Symbol.Variable(s"${prefix}_data") - var conv1_1 = Symbol.Convolution()()(Map("data" -> data, "num_filter" -> 48, - "kernel" -> "(5, 5)", "pad" -> "(2, 2)", "no_bias" -> false, "workspace" -> 4096)) - conv1_1 = Symbol.BatchNorm()()(Map("data" -> conv1_1, "fix_gamma" -> false)) - conv1_1 = Symbol.LeakyReLU()()(Map("data" -> conv1_1, "act_type" -> "leaky")) - - var conv2_1 = Symbol.Convolution()()(Map("data" -> conv1_1, "num_filter" -> 32, - "kernel" -> "(5, 5)", "pad" -> "(2, 2)", "no_bias" -> false, "workspace" -> 4096)) - conv2_1 = Symbol.BatchNorm()()(Map("data" -> conv2_1, "fix_gamma" -> false)) - conv2_1 = Symbol.LeakyReLU()()(Map("data" -> conv2_1, "act_type" -> "leaky")) - - var conv3_1 = Symbol.Convolution()()(Map("data" -> conv2_1, "num_filter" -> 64, - "kernel" -> "(3, 3)", "pad" -> "(1, 1)", "no_bias" -> false, "workspace" -> 4096)) - conv3_1 = Symbol.BatchNorm()()(Map("data" -> conv3_1, "fix_gamma" -> false)) - conv3_1 = Symbol.LeakyReLU()()(Map("data" -> conv3_1, "act_type" -> "leaky")) - - var conv4_1 = Symbol.Convolution()()(Map("data" -> conv3_1, "num_filter" -> 32, - "kernel" -> "(5, 5)", "pad" -> "(2, 2)", "no_bias" -> false, "workspace" -> 4096)) - conv4_1 = Symbol.BatchNorm()()(Map("data" -> conv4_1, "fix_gamma" -> false)) - conv4_1 = Symbol.LeakyReLU()()(Map("data" -> conv4_1, "act_type" -> "leaky")) - - var conv5_1 = Symbol.Convolution()()(Map("data" -> conv4_1, "num_filter" -> 48, - "kernel" -> "(5, 5)", "pad" -> "(2, 2)", "no_bias" -> false, "workspace" -> 4096)) - conv5_1 = Symbol.BatchNorm()()(Map("data" -> conv5_1, "fix_gamma" -> false)) - conv5_1 = Symbol.LeakyReLU()()(Map("data" -> conv5_1, "act_type" -> "leaky")) - - var conv6_1 = Symbol.Convolution()()(Map("data" -> conv5_1, "num_filter" -> 32, - "kernel" -> "(5, 5)", "pad" -> "(2, 2)", "no_bias" -> true, "workspace" -> 4096)) - conv6_1 = Symbol.BatchNorm()()(Map("data" -> conv6_1, "fix_gamma" -> false)) - conv6_1 = Symbol.LeakyReLU()()(Map("data" -> conv6_1, "act_type" -> "leaky")) - - var out = Symbol.Convolution()()(Map("data" -> conv6_1, "num_filter" -> 3, "kernel" -> "(3, 3)", - "pad" -> "(1, 1)", "no_bias" -> true, "workspace" -> 4096)) - out 
= Symbol.BatchNorm()()(Map("data" -> out, "fix_gamma" -> false)) - out = Symbol.Activation()()(Map("data" -> out, "act_type" -> "tanh")) + var conv1_1 = Conv(data, 48, 4096) + val conv2_1 = Conv(conv1_1, 32, 4096) + var conv3_1 = Conv(conv2_1, 64, 4096, (3, 3), (1, 1)) + var conv4_1 = Conv(conv3_1, 32, 4096) + var conv5_1 = Conv(conv4_1, 48, 4096) + var conv6_1 = Conv(conv5_1, 32, 4096) + var out = Symbol.api.Convolution(data = Some(conv6_1), num_filter = 3, kernel = Shape(3, 3), + pad = Some(Shape(1, 1)), no_bias = Some(true), workspace = Some(4096)) + out = Symbol.api.BatchNorm(data = Some(out), fix_gamma = Some(false)) + out = Symbol.api.Activation(data = Some(out), act_type = "tanh") val rawOut = (out * 128) + 128 - val norm = Symbol.SliceChannel()(rawOut)(Map("num_outputs" -> 3)) + val norm = Symbol.api.SliceChannel(data = Some(rawOut), num_outputs = 3) val rCh = norm.get(0) - 123.68f val gCh = norm.get(1) - 116.779f val bCh = norm.get(2) - 103.939f - val normOut = Symbol.Concat()(rCh, gCh, bCh)() * 0.4f + data * 0.6f - normOut + val normOut = Symbol.api.Concat(data = Array(rCh, gCh, bCh), num_args = 3) + normOut * 0.4f + data * 0.6f } def getModule(prefix: String, dShape: Shape, ctx: Context, isTrain: Boolean = true): Module = { @@ -99,9 +64,9 @@ object GenV4 { else (dataShape, false, false) } val mod = new Module(symbol = sym, context = ctx, - dataShapes = dataShapes, - initializer = new Xavier(magnitude = 2f), - forTraining = forTraining, inputsNeedGrad = inputsNeedGrad) + dataShapes = dataShapes, + initializer = new Xavier(magnitude = 2f), + forTraining = forTraining, inputsNeedGrad = inputsNeedGrad) mod } } diff --git a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/neuralstyle/end2end/ModelVgg19.scala b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/neuralstyle/end2end/ModelVgg19.scala deleted file mode 100644 index 6044847be..000000000 --- a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/neuralstyle/end2end/ModelVgg19.scala +++ /dev/null @@ -1,111 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.mxnetexamples.neuralstyle.end2end - -import org.apache.mxnet.Executor -import org.apache.mxnet.NDArray -import org.apache.mxnet.Symbol - - -object ModelVgg19 { - - def getVggSymbol(prefix: String, contentOnly: Boolean = false): (Symbol, Symbol) = { - // declare symbol - val data = Symbol.Variable(s"${prefix}_data") - val conv1_1 = Symbol.Convolution(s"${prefix}_conv1_1")()(Map("data" -> data, - "num_filter" -> 64, "pad" -> "(1,1)", "kernel" -> "(3,3)", - "stride" -> "(1,1)", "no_bias" -> false, "workspace" -> 1024)) - val relu1_1 = Symbol.Activation(s"${prefix}_relu1_1")()(Map("data" -> conv1_1, - "act_type" -> "relu")) - val conv1_2 = Symbol.Convolution(s"${prefix}_conv1_2")()(Map("data" -> relu1_1, - "num_filter" -> 64, "pad" -> "(1,1)", "kernel" -> "(3,3)", - "stride" -> "(1,1)", "no_bias" -> false, "workspace" -> 1024)) - val relu1_2 = Symbol.Activation(s"${prefix}_relu1_2")()(Map("data" -> conv1_2, - "act_type" -> "relu")) - val pool1 = Symbol.Pooling(s"${prefix}_pool1")()(Map("data" -> relu1_2 , "pad" -> "(0,0)", - "kernel" -> "(2,2)", "stride" -> "(2,2)", "pool_type" -> "avg")) - val conv2_1 = Symbol.Convolution(s"${prefix}_conv2_1")()(Map("data" -> pool1, - "num_filter" -> 128, "pad" -> "(1,1)", "kernel" -> "(3,3)", - "stride" -> "(1,1)", "no_bias" -> false, "workspace" -> 1024)) - val relu2_1 = Symbol.Activation(s"${prefix}_relu2_1")()(Map("data" -> conv2_1, - "act_type" -> "relu")) - val conv2_2 = Symbol.Convolution(s"${prefix}_conv2_2")()(Map("data" -> relu2_1, - "num_filter" -> 128, "pad" -> "(1,1)", "kernel" -> "(3,3)", - "stride" -> "(1,1)", "no_bias" -> false, "workspace" -> 1024)) - val relu2_2 = Symbol.Activation(s"${prefix}_relu2_2")()(Map("data" -> conv2_2, - "act_type" -> "relu")) - val pool2 = Symbol.Pooling("pool2")()(Map("data" -> relu2_2 , "pad" -> "(0,0)", - "kernel" -> "(2,2)", "stride" -> "(2,2)", "pool_type" -> "avg")) - val conv3_1 = Symbol.Convolution(s"${prefix}_conv3_1")()(Map("data" -> pool2, - "num_filter" -> 256, "pad" -> "(1,1)", "kernel" -> "(3,3)", - "stride" -> "(1,1)", "no_bias" -> false, "workspace" -> 1024)) - val relu3_1 = Symbol.Activation(s"${prefix}_relu3_1")()(Map("data" -> conv3_1, - "act_type" -> "relu")) - val conv3_2 = Symbol.Convolution(s"${prefix}_conv3_2")()(Map("data" -> relu3_1, - "num_filter" -> 256, "pad" -> "(1,1)", "kernel" -> "(3,3)", - "stride" -> "(1,1)", "no_bias" -> false, "workspace" -> 1024)) - val relu3_2 = Symbol.Activation(s"${prefix}_relu3_2")()(Map("data" -> conv3_2, - "act_type" -> "relu")) - val conv3_3 = Symbol.Convolution(s"${prefix}_conv3_3")()(Map("data" -> relu3_2, - "num_filter" -> 256, "pad" -> "(1,1)", "kernel" -> "(3,3)", - "stride" -> "(1,1)", "no_bias" -> false, "workspace" -> 1024)) - val relu3_3 = Symbol.Activation(s"${prefix}_relu3_3")()(Map("data" -> conv3_3, - "act_type" -> "relu")) - val conv3_4 = Symbol.Convolution(s"${prefix}_conv3_4")()(Map("data" -> relu3_3, - "num_filter" -> 256, "pad" -> "(1,1)", "kernel" -> "(3,3)", - "stride" -> "(1,1)", "no_bias" -> false, "workspace" -> 1024)) - val relu3_4 = Symbol.Activation(s"${prefix}_relu3_4")()(Map("data" -> conv3_4 , - "act_type" -> "relu")) - val pool3 = Symbol.Pooling(s"${prefix}_pool3")()(Map("data" -> relu3_4, - "pad" -> "(0,0)", "kernel" -> "(2,2)", "stride" -> "(2,2)", - "pool_type" -> "avg")) - val conv4_1 = Symbol.Convolution(s"${prefix}_conv4_1")()(Map("data" -> pool3, - "num_filter" -> 512, "pad" -> "(1,1)", "kernel" -> "(3,3)", - "stride" -> "(1,1)", "no_bias" -> false, "workspace" -> 1024)) - val relu4_1 = 
Symbol.Activation(s"${prefix}_relu4_1")()(Map("data" -> conv4_1, - "act_type" -> "relu")) - val conv4_2 = Symbol.Convolution(s"${prefix}_conv4_2")()(Map("data" -> relu4_1, - "num_filter" -> 512, "pad" -> "(1,1)", "kernel" -> "(3,3)", - "stride" -> "(1,1)", "no_bias" -> false, "workspace" -> 1024)) - val relu4_2 = Symbol.Activation(s"${prefix}_relu4_2")()(Map("data" -> conv4_2, - "act_type" -> "relu")) - val conv4_3 = Symbol.Convolution(s"${prefix}_conv4_3")()(Map("data" -> relu4_2, - "num_filter" -> 512, "pad" -> "(1,1)", "kernel" -> "(3,3)", - "stride" -> "(1,1)", "no_bias" -> false, "workspace" -> 1024)) - val relu4_3 = Symbol.Activation(s"${prefix}_relu4_3")()(Map("data" -> conv4_3, - "act_type" -> "relu")) - val conv4_4 = Symbol.Convolution(s"${prefix}_conv4_4")()(Map("data" -> relu4_3, - "num_filter" -> 512, "pad" -> "(1,1)", "kernel" -> "(3,3)", - "stride" -> "(1,1)", "no_bias" -> false, "workspace" -> 1024)) - val relu4_4 = Symbol.Activation(s"${prefix}_relu4_4")()(Map("data" -> conv4_4, - "act_type" -> "relu")) - val pool4 = Symbol.Pooling(s"${prefix}_pool4")()(Map("data" -> relu4_4, - "pad" -> "(0,0)", "kernel" -> "(2,2)", "stride" -> "(2,2)", - "pool_type" -> "avg")) - val conv5_1 = Symbol.Convolution(s"${prefix}_conv5_1")()(Map("data" -> pool4, - "num_filter" -> 512, "pad" -> "(1,1)", "kernel" -> "(3,3)", - "stride" -> "(1,1)", "no_bias" -> false, "workspace" -> 1024)) - val relu5_1 = Symbol.Activation(s"${prefix}_relu5_1")()(Map("data" -> conv5_1, - "act_type" -> "relu")) - - // style and content layers - val style = if (contentOnly) null else Symbol.Group(relu1_1, relu2_1, relu3_1, relu4_1, relu5_1) - val content = Symbol.Group(relu4_2) - (style, content) - } -} diff --git a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/neuralstyle/end2end/Module.scala b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/neuralstyle/end2end/Module.scala index d681b16c5..1d11f8864 100644 --- a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/neuralstyle/end2end/Module.scala +++ b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/neuralstyle/end2end/Module.scala @@ -17,20 +17,9 @@ package org.apache.mxnetexamples.neuralstyle.end2end -import org.apache.mxnet.Context +import org.apache.mxnet.{Context, Initializer, NDArray, Optimizer, Shape, Symbol, Uniform} import org.slf4j.LoggerFactory -import org.apache.mxnet.Symbol -import org.apache.mxnet.NDArray -import org.apache.mxnet.Optimizer -import org.apache.mxnet.Executor -import org.apache.mxnet.Shape -import org.apache.mxnet.Uniform -import org.apache.mxnet.Initializer -import org.apache.mxnet.DataBatch - -/** - * @author Depeng Liang - */ + class Module(symbol: Symbol, context: Context, dataShapes: Map[String, Shape], diff --git a/scala-package/examples/src/test/scala/org/apache/mxnetexamples/gan/GanExampleSuite.scala b/scala-package/examples/src/test/scala/org/apache/mxnetexamples/gan/GanExampleSuite.scala index 8ab3a4b36..96820ce4e 100644 --- a/scala-package/examples/src/test/scala/org/apache/mxnetexamples/gan/GanExampleSuite.scala +++ b/scala-package/examples/src/test/scala/org/apache/mxnetexamples/gan/GanExampleSuite.scala @@ -18,41 +18,38 @@ package org.apache.mxnetexamples.gan import java.io.File -import java.net.URL - -import org.apache.commons.io.FileUtils import org.apache.mxnet.Context import org.apache.mxnetexamples.Util -import org.scalatest.{BeforeAndAfterAll, FunSuite} +import org.scalatest.{BeforeAndAfterAll, FunSuite, Ignore} import org.slf4j.LoggerFactory import 
scala.sys.process.Process +@Ignore class GanExampleSuite extends FunSuite with BeforeAndAfterAll{ private val logger = LoggerFactory.getLogger(classOf[GanExampleSuite]) test("Example CI: Test GAN MNIST") { - if (System.getenv().containsKey("SCALA_TEST_ON_GPU") && - System.getenv("SCALA_TEST_ON_GPU").toInt == 1) { - logger.info("Downloading mnist model") - val baseUrl = "https://s3.us-east-2.amazonaws.com/mxnet-scala/scala-example-ci" - val tempDirPath = System.getProperty("java.io.tmpdir") - val modelDirPath = tempDirPath + File.separator + "mnist/" - logger.info("tempDirPath: %s".format(tempDirPath)) - Util.downloadUrl(baseUrl + "/mnist/mnist.zip", - tempDirPath + "/mnist/mnist.zip") - // TODO: Need to confirm with Windows - Process("unzip " + tempDirPath + "/mnist/mnist.zip -d " - + tempDirPath + "/mnist/") ! - - val context = Context.gpu() - - val output = GanMnist.runTraining(modelDirPath, context, modelDirPath, 5) - Process("rm -rf " + modelDirPath) ! - - assert(output >= 0.0f) - } else { - logger.info("GPU test only, skipped...") - } + if (System.getenv().containsKey("SCALA_TEST_ON_GPU") && + System.getenv("SCALA_TEST_ON_GPU").toInt == 1) { + logger.info("Downloading mnist model") + val baseUrl = "https://s3.us-east-2.amazonaws.com/mxnet-scala/scala-example-ci" + val tempDirPath = System.getProperty("java.io.tmpdir") + val modelDirPath = tempDirPath + File.separator + "mnist/" + logger.info("tempDirPath: %s".format(tempDirPath)) + Util.downloadUrl(baseUrl + "/mnist/mnist.zip", tempDirPath + "/mnist/mnist.zip") + // TODO: Need to confirm with Windows + Process("unzip " + tempDirPath + "/mnist/mnist.zip -d " + + tempDirPath + "/mnist/") ! + + val context = Context.gpu() + + val output = GanMnist.runTraining(modelDirPath, context, modelDirPath, 5) + Process("rm -rf " + modelDirPath) ! + + assert(output >= 0.0f) + } else { + logger.info("GPU test only, skipped...") + } } } diff --git a/scala-package/examples/src/test/scala/org/apache/mxnetexamples/imclassification/MNISTExampleSuite.scala b/scala-package/examples/src/test/scala/org/apache/mxnetexamples/imclassification/MNISTExampleSuite.scala index 7b1d6ddc3..0fd3af02d 100644 --- a/scala-package/examples/src/test/scala/org/apache/mxnetexamples/imclassification/MNISTExampleSuite.scala +++ b/scala-package/examples/src/test/scala/org/apache/mxnetexamples/imclassification/MNISTExampleSuite.scala @@ -29,8 +29,7 @@ import org.slf4j.LoggerFactory import scala.sys.process.Process /** - * Integration test for imageClassifier example. - * This will run as a part of "make scalatest" + * Integration test for MNIST example. */ class MNISTExampleSuite extends FunSuite with BeforeAndAfterAll { private val logger = LoggerFactory.getLogger(classOf[MNISTExampleSuite]) diff --git a/scala-package/examples/src/test/scala/org/apache/mxnetexamples/neuralstyle/NeuralStyleSuite.scala b/scala-package/examples/src/test/scala/org/apache/mxnetexamples/neuralstyle/NeuralStyleSuite.scala new file mode 100644 index 000000000..dc8fc5b8c --- /dev/null +++ b/scala-package/examples/src/test/scala/org/apache/mxnetexamples/neuralstyle/NeuralStyleSuite.scala @@ -0,0 +1,92 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mxnetexamples.neuralstyle
+
+import org.apache.mxnet.Context
+import org.apache.mxnetexamples.Util
+import org.apache.mxnetexamples.neuralstyle.end2end.{BoostInference, BoostTrain}
+import org.scalatest.{BeforeAndAfterAll, FunSuite}
+import org.slf4j.LoggerFactory
+
+import scala.sys.process.Process
+
+/**
+ * Test suite for the neural style examples.
+ * There is currently no plan to check training accuracy;
+ * these tests only verify that the models are runnable.
+ */
+class NeuralStyleSuite extends FunSuite with BeforeAndAfterAll {
+  private val logger = LoggerFactory.getLogger(classOf[NeuralStyleSuite])
+
+
+  override def beforeAll(): Unit = {
+    logger.info("Downloading vgg model")
+    val tempDirPath = System.getProperty("java.io.tmpdir")
+    logger.info("tempDirPath: %s".format(tempDirPath))
+    val baseUrl = "https://s3.us-east-2.amazonaws.com/mxnet-scala/scala-example-ci/NeuralStyle/"
+    Util.downloadUrl(baseUrl + "IMG_4343.jpg", tempDirPath + "/NS/IMG_4343.jpg")
+    Util.downloadUrl(baseUrl + "starry_night.jpg", tempDirPath + "/NS/starry_night.jpg")
+    Util.downloadUrl(baseUrl + "model.zip", tempDirPath + "/NS/model.zip")
+    Util.downloadUrl(baseUrl + "vgg19.params", tempDirPath + "/NS/vgg19.params")
+    // TODO: Need to confirm with Windows
+    Process(s"unzip $tempDirPath/NS/model.zip -d $tempDirPath/NS/") !
+
+    Process(s"mkdir $tempDirPath/NS/images") !
+
+    for (i <- 0 until 20) {
+      Process(s"cp $tempDirPath/NS/IMG_4343.jpg $tempDirPath/NS/images/img$i.jpg") !
+    }
+  }
+
+  test("Example CI: Test Boost Inference") {
+    val tempDirPath = System.getProperty("java.io.tmpdir")
+    var ctx = Context.cpu()
+    if (System.getenv().containsKey("SCALA_TEST_ON_GPU") &&
+      System.getenv("SCALA_TEST_ON_GPU").toInt == 1) {
+      ctx = Context.gpu()
+    }
+    BoostInference.runInference(tempDirPath + "/NS/model", tempDirPath + "/NS", 2,
+      tempDirPath + "/NS/IMG_4343.jpg", ctx)
+  }
+
+  test("Example CI: Test Boost Training") {
+    val tempDirPath = System.getProperty("java.io.tmpdir")
+    if (System.getenv().containsKey("SCALA_TEST_ON_GPU") &&
+      System.getenv("SCALA_TEST_ON_GPU").toInt == 1) {
+      val ctx = Context.gpu()
+      BoostTrain.runTraining(tempDirPath + "/NS/images", tempDirPath + "/NS/vgg19.params", ctx,
+        tempDirPath + "/NS/starry_night.jpg", tempDirPath + "/NS")
+    } else {
+      logger.info("GPU test only, skip CPU...")
+    }
+  }
+
+  test("Example CI: Test Neural Style") {
+    val tempDirPath = System.getProperty("java.io.tmpdir")
+    if (System.getenv().containsKey("SCALA_TEST_ON_GPU") &&
+      System.getenv("SCALA_TEST_ON_GPU").toInt == 1) {
+      val ctx = Context.gpu()
+      NeuralStyle.runTraining("vgg19", tempDirPath + "/NS/IMG_4343.jpg",
+        tempDirPath + "/NS/starry_night.jpg",
+        ctx, tempDirPath + "/NS/vgg19.params", tempDirPath + "/NS",
+        1f, 20f, 0.01f, 1, 10f, 60, 600, 50, 0.0005f)
+    } else {
+      logger.info("GPU test only, skip CPU")
+    }
+  }
+}

From ed203047573db7681feb5631901a609e6e074971 Mon Sep 17 00:00:00 2001
From: Da Zheng
Date: Tue, 31 Jul 2018 16:31:39 -0700
Subject: [PATCH 36/63] [MXNET-750] fix nested call on CachedOp. (#11951)

* fix nested call on cachedop.

* fix.
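[Editor's note] To make the "nested call" concrete for readers of this series: the sketch below is my reconstruction of the failing pattern, based on the `check_contrib_rnn` test this patch touches. The `ForeachLayer` wrapper, the shapes, and the use of `inline_limit=0` are illustrative assumptions on my part, not part of the patch itself.

```python
# Hypothetical repro sketch (assumed, not from the patch). A hybridized
# block whose forward uses contrib.foreach produces an inner CachedOp;
# calling it under autograd.record() exercises the nested path that
# RunGraph's new explicit `recording` argument is meant to handle.
import mxnet as mx
from mxnet import autograd, gluon

class ForeachLayer(gluon.HybridBlock):
    def __init__(self, hidden_size, **kwargs):
        super(ForeachLayer, self).__init__(**kwargs)
        with self.name_scope():
            self.cell = gluon.rnn.RNNCell(hidden_size)

    def hybrid_forward(self, F, inputs, states):
        # contrib.foreach builds a subgraph that runs as its own CachedOp.
        outputs, _ = F.contrib.foreach(self.cell, inputs, states)
        return outputs

layer = ForeachLayer(hidden_size=8)
layer.initialize()
# inline_limit=0 (one of the configs added to the test below) keeps the
# inner subgraph from being inlined into the outer cached graph.
layer.hybridize(inline_limit=0)

data = mx.nd.random.uniform(shape=(5, 2, 4))   # (seq_len, batch, feat)
states = [mx.nd.zeros((2, 8))]                 # (batch, hidden)
with autograd.record():
    out = layer(data, states)
out.backward()
```

Passing `recording` into RunGraph explicitly, instead of toggling the global recording flag around the call as the old code did, is what keeps a nested CachedOp from observing the wrong flag.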
--- src/imperative/cached_op.cc | 12 ++++++------ src/imperative/imperative.cc | 3 ++- src/imperative/imperative_utils.cc | 4 ++-- src/imperative/imperative_utils.h | 3 ++- tests/python/unittest/test_contrib_control_flow.py | 1 + 5 files changed, 13 insertions(+), 10 deletions(-) diff --git a/src/imperative/cached_op.cc b/src/imperative/cached_op.cc index d4da99ea9..1e7f8e0de 100644 --- a/src/imperative/cached_op.cc +++ b/src/imperative/cached_op.cc @@ -821,12 +821,11 @@ OpStatePtr CachedOp::DynamicForward( const auto& dispatch_modes = g.GetAttr("dispatch_mode"); - if (recording && !inlining_) Imperative::Get()->set_is_recording(false); - + // If we are already recording, we don't need RunGraph to record all + // computation again. RunGraph(false, idx, arrays, 0, idx.num_nodes(), std::move(array_reqs), - std::move(ref_count), &states, dispatch_modes); - - Imperative::Get()->set_is_recording(recording); + std::move(ref_count), &states, dispatch_modes, + !recording || inlining_); return op_state; } @@ -947,7 +946,8 @@ void CachedOp::DynamicBackward( const auto& dispatch_modes = g.GetAttr("dispatch_mode"); RunGraph(retain_graph, idx, arrays, num_forward_nodes, idx.num_nodes(), - std::move(array_reqs), std::move(ref_count), &states, dispatch_modes); + std::move(array_reqs), std::move(ref_count), &states, dispatch_modes, + Imperative::Get()->is_recording()); if (retain_graph) { buff.resize(num_forward_entries); diff --git a/src/imperative/imperative.cc b/src/imperative/imperative.cc index e1654259a..0c5ff8417 100644 --- a/src/imperative/imperative.cc +++ b/src/imperative/imperative.cc @@ -495,7 +495,8 @@ std::vector Imperative::Backward( int prev_bulk_size = Engine::Get()->set_bulk_size(backward_bulk_size_); RunGraph(retain_graph, idx, arrays, num_forward_nodes, idx.num_nodes(), - std::move(array_reqs), std::move(ref_count), &states, dispatch_modes); + std::move(array_reqs), std::move(ref_count), &states, dispatch_modes, + is_recording()); Engine::Get()->set_bulk_size(prev_bulk_size); set_is_recording(prev_recording); diff --git a/src/imperative/imperative_utils.cc b/src/imperative/imperative_utils.cc index 464aefc22..c84a3b9be 100644 --- a/src/imperative/imperative_utils.cc +++ b/src/imperative/imperative_utils.cc @@ -30,7 +30,8 @@ void RunGraph( std::vector&& array_reqs, std::vector&& ref_count, std::vector *p_states, - const DispatchModeVector &dispatch_modes) { + const DispatchModeVector &dispatch_modes, + bool recording) { using namespace nnvm; using namespace imperative; static auto& createop = nnvm::Op::GetAttr("FCreateOpState"); @@ -40,7 +41,6 @@ void RunGraph( const auto imp = Imperative::Get(); std::vector& states = *p_states; - bool recording = imp->is_recording(); std::vector ndinputs, ndoutputs; ShapeVector arg_shapes; diff --git a/src/imperative/imperative_utils.h b/src/imperative/imperative_utils.h index 6daf96e60..9c86843ca 100644 --- a/src/imperative/imperative_utils.h +++ b/src/imperative/imperative_utils.h @@ -994,7 +994,8 @@ void RunGraph(const bool retain_graph, std::vector&& array_reqs, std::vector&& ref_count, std::vector *p_states, - const DispatchModeVector &dispatch_modes); + const DispatchModeVector &dispatch_modes, + bool recording); } // namespace imperative } // namespace mxnet diff --git a/tests/python/unittest/test_contrib_control_flow.py b/tests/python/unittest/test_contrib_control_flow.py index 67ed78ee0..f1188b53d 100644 --- a/tests/python/unittest/test_contrib_control_flow.py +++ b/tests/python/unittest/test_contrib_control_flow.py @@ -1159,6 +1159,7 @@ def 
check_contrib_rnn(cell_type, num_states): configs = [ {}, + {'inline_limit': 0}, {'static_alloc': True}, {'static_alloc': True, 'static_shape': True} ] for config in configs: From 51f650e0bf3b905fec4aebfc873c1c56eac61536 Mon Sep 17 00:00:00 2001 From: Sheng Zha Date: Tue, 31 Jul 2018 16:58:21 -0700 Subject: [PATCH 37/63] extend reshape op to allow reverse shape inference (#11956) --- src/c_api/c_api.cc | 2 ++ src/operator/tensor/matrix_op-inl.h | 36 +++++++++++++++++++++----- tests/python/unittest/test_operator.py | 35 ++++++++++++++++++------- 3 files changed, 58 insertions(+), 15 deletions(-) diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index 118af6793..ed513c0d7 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -443,6 +443,8 @@ MXNET_DLL int MXNDArrayReshape64(NDArrayHandle handle, API_BEGIN(); NDArray *arr = static_cast(handle); nnvm::Tuple shape(dims, dims+ndim); + CHECK_GT(arr->shape().Size(), 0) << "Source ndarray's shape is undefined. Input shape: " + << arr->shape(); TShape new_shape = mxnet::op::InferReshapeShape(shape, arr->shape(), reverse); *ptr = arr->ReshapeWithRecord(new_shape); *out = ptr; diff --git a/src/operator/tensor/matrix_op-inl.h b/src/operator/tensor/matrix_op-inl.h index eec920555..78e1fa1d9 100644 --- a/src/operator/tensor/matrix_op-inl.h +++ b/src/operator/tensor/matrix_op-inl.h @@ -122,7 +122,7 @@ inline TShape InferReshapeShape(const nnvm::Tuple& shape, CHECK(d1 != -1 || d2 != -1) << "Split dims cannot both be -1."; if (d1 == -1) d1 = d0 / d2; if (d2 == -1) d2 = d0 / d1; - CHECK_EQ(d1 * d2, static_cast(d0)) << + CHECK(d1 * d2 == static_cast(d0) || static_cast(d0) == IType(0)) << "Split dims " << d1 << ", " << d2 << " do not divide original dim " << d0; tmp.push_back(d1); tmp.push_back(d2); @@ -151,13 +151,36 @@ inline TShape InferReshapeShape(const nnvm::Tuple& shape, return oshape; } +inline bool ReverseReshapeInferShape(TShape *in, const TShape& out) { + if (in->Size() && out.Size()) { + return true; + } else if (!out.Size()) { + return false; + } else { + int zero_axis = -1; + int non_zero_prod = 1; + for (index_t i = 0; i < in->ndim(); i++) { + if ((*in)[i] == 0) { + if (zero_axis != -1) + return false; // more than 1 zero found. + else + zero_axis = i; + } else { + non_zero_prod *= (*in)[i]; + } + } + (*in)[zero_axis] = out.Size() / non_zero_prod; + return true; + } +} + inline bool ReshapeShape(const nnvm::NodeAttrs& attrs, - std::vector *in_attrs, - std::vector *out_attrs) { + std::vector *in_attrs, + std::vector *out_attrs) { const ReshapeParam& param_ = nnvm::get(attrs.parsed); CHECK_EQ(in_attrs->size(), 1U) << "Input: [data]"; CHECK_EQ(out_attrs->size(), 1U); - const TShape &dshape = (*in_attrs)[0]; + TShape &dshape = (*in_attrs)[0]; if (dshape.ndim() == 0) return false; TShape oshape; if (param_.shape.ndim() != 0) { @@ -182,14 +205,15 @@ inline bool ReshapeShape(const nnvm::NodeAttrs& attrs, oshape[inf_idx] = dshape.Size() / oshape.Size(); } } else { - return (*out_attrs)[0].ndim(); + return (*out_attrs)[0].ndim() && ReverseReshapeInferShape(&(*in_attrs)[0], (*out_attrs)[0]); } + ReverseReshapeInferShape(&dshape, oshape); CHECK_EQ(oshape.Size(), dshape.Size()) << "Target shape size is different to source. 
" << "Target: " << oshape << "\nSource: " << dshape; SHAPE_ASSIGN_CHECK(*out_attrs, 0, oshape); - return true; + return ReverseReshapeInferShape(&(*in_attrs)[0], (*out_attrs)[0]); } inline bool FlattenShape(const nnvm::NodeAttrs& attrs, diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index 99d635e35..12d0bd116 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -1943,11 +1943,11 @@ def test_bxor(a, b): test_bmul(a, b) test_bdiv(a, b) ''' - Flaky Test Disabled due to master build failure: - http://jenkins.mxnet-ci.amazon-ml.com/blue/organizations/jenkins/incubator-mxnet/detail/master/1248/pipeline + Flaky Test Disabled due to master build failure: + http://jenkins.mxnet-ci.amazon-ml.com/blue/organizations/jenkins/incubator-mxnet/detail/master/1248/pipeline Github Issue: https://github.com/apache/incubator-mxnet/issues/11838 - - test_bmod(a, b) + + test_bmod(a, b) ''' test_bmod_int(a, b) test_bpow(a, b) @@ -2065,6 +2065,23 @@ def test_reshape_new(src_shape, shape_args, reverse, dst_shape): assert np.square(exe.grad_dict['data'].asnumpy() - grad_npy.reshape(src_shape)).mean() < 1E-7, \ 'Src Shape = %s, Shape Arguments = %s, Reverse = %s, Dst Shape = %s'\ %(str(src_shape), str(shape_args), str(reverse), str(dst_shape)) + + for i in range(len(src_shape)): + holdout_src_shape = list(src_shape) + holdout_src_shape[i] = 0 + holdout_src_shape = tuple(holdout_src_shape) + net = mx.sym.Variable('data') + net = mx.sym.elemwise_add(net.reshape(shape_args, reverse=reverse), mx.sym.ones(shape=dst_shape)) + input_shape, output_shape, __ = net.infer_shape(data=holdout_src_shape) + assert output_shape[0] == dst_shape, \ + 'Holdout Src Shape = %s, Shape Arguments = %s, Reverse = %s, Dst Shape = %s, ' \ + 'Output Shape = %s' %(str(holdout_src_shape), str(shape_args), str(reverse), + str(dst_shape), str(output_shape[0])) + assert input_shape[0] == src_shape, \ + 'Holdout Src Shape = %s, Shape Arguments = %s, Reverse = %s, Dst Shape = %s, ' \ + 'Output Shape = %s' %(str(holdout_src_shape), str(shape_args), str(reverse), + str(dst_shape), str(output_shape[0])) + # Test new api (Using shape) test_cases = [ [(2, 3, 5, 5), (0, -1), False, (2, 75)], @@ -6615,7 +6632,7 @@ def test_diag(): w = np.random.randint(2,9) a_np = np.random.random((h, w)).astype(np.float32) a = mx.nd.array(a_np).astype('float32') - + # k == 0 r = mx.nd.diag(a) assert_almost_equal(r.asnumpy(), np.diag(a_np)) @@ -6658,7 +6675,7 @@ def test_diag(): d = np.random.randint(2,9) a_np = np.random.random((d)) a = mx.nd.array(a_np) - + # k is random k = np.random.randint(-d,d) r = mx.nd.diag(a, k=k) @@ -6725,7 +6742,7 @@ def test_invalid_block_size(): invalid_shape_inp = (n , c, h, w) data = rand_ndarray(invalid_shape_inp, 'default') assertRaises(MXNetError, mx.nd.depth_to_space, data, block) - + test_invalid_depth_dim() test_invalid_space_dim() test_invalid_block_size() @@ -6771,12 +6788,12 @@ def test_invalid_block_size(): invalid_shape_inp = (n, c, h, w) data = rand_ndarray(invalid_shape_inp, 'default') assertRaises(MXNetError, mx.nd.space_to_depth, data, block) - + def test_invalid_depth_dim(): invalid_shape_inp = (n, 0, h, w) data = rand_ndarray(invalid_shape_inp, 'default') assertRaises(MXNetError, mx.nd.space_to_depth, data, block) - + test_invalid_space_dim() test_invalid_block_size() test_invalid_depth_dim() From 1eef07071a6b03e72e8e94c6d6e2c504f5ee8e23 Mon Sep 17 00:00:00 2001 From: Haibin Lin Date: Tue, 31 Jul 2018 17:04:10 -0700 Subject: [PATCH 
From 1eef07071a6b03e72e8e94c6d6e2c504f5ee8e23 Mon Sep 17 00:00:00 2001
From: Haibin Lin
Date: Tue, 31 Jul 2018 17:04:10 -0700
Subject: [PATCH 38/63] Improve sparse embedding index out of bound error
 message (#11940)

---
 src/operator/tensor/indexing_op.cc | 38 ++++++++++++++++++++----
 src/operator/tensor/indexing_op.cu | 46 ++++++++++++++++++++++++------
 2 files changed, 70 insertions(+), 14 deletions(-)

diff --git a/src/operator/tensor/indexing_op.cc b/src/operator/tensor/indexing_op.cc
index 0f96e2cc2..ef59145bb 100644
--- a/src/operator/tensor/indexing_op.cc
+++ b/src/operator/tensor/indexing_op.cc
@@ -28,6 +28,27 @@
 namespace mxnet {
 namespace op {
 
+/*
+ * \brief returns true if all indices are between [min, max]
+ * \param data_ptr the indices to check
+ * \param data_size the number of indices to examine
+ * \param min the expected min value for indices
+ * \param max the expected max value for indices
+ */
+template<typename DType>
+bool CheckIndexOutOfBound(const DType* data_ptr, size_t data_size,
+                          const DType min, const DType max) {
+  bool is_valid = true;
+  for (size_t i = 0; i < data_size; i++) {
+    if (data_ptr[i] > max || data_ptr[i] < min) {
+      is_valid = false;
+      break;
+    }
+  }
+  return is_valid;
+}
+
+
 template<>
 void SparseEmbeddingOpForwardRspImpl<cpu>(const OpContext& ctx,
                                           const TBlob& data,
@@ -48,18 +69,16 @@ void SparseEmbeddingOpForwardRspImpl<cpu>(const OpContext& ctx,
     return;
   }
   // check out-of-bound indices
-  bool is_valid = true;
   MSHADOW_TYPE_SWITCH(data.type_flag_, DType, {
     DType min = 0;
     DType max = static_cast<DType>(weight.shape()[0] - 1);
     // check with single thread is faster since data is small
     DType* data_ptr = data.dptr<DType>();
     size_t data_size = data.shape_.Size();
-    for (size_t i = 0; i < data_size; i++) {
-      if (data_ptr[i] > max || data_ptr[i] < min) is_valid = false;
-    }
+    bool is_valid = CheckIndexOutOfBound(data_ptr, data_size,
+                                         min, max);
+    CHECK(is_valid) << "SparseEmbedding input contains data out of bound";
  })
-  CHECK(is_valid) << "SparseEmbedding input contains data out of bound";
   // the weight is actually dense
   if (weight.aux_shape(kIdx)[0] == weight.shape()[0]) {
     EmbeddingOpForwardDnsImpl<cpu>(s, data, weight.data(), req, output);
@@ -101,6 +120,15 @@ inline void SparseEmbeddingOpBackwardRspImpl(const bool deterministic,
   MSHADOW_TYPE_SWITCH(data.type_flag_, IType, {
     MSHADOW_SGL_DBL_TYPE_SWITCH(ograd.type_flag_, DType, {
       MSHADOW_IDX_TYPE_SWITCH(output.aux_type(kIdx), RType, {
+        // check out of bound indices
+        {
+          IType min = 0;
+          IType max = static_cast<IType>(output.shape()[0] - 1);
+          // checking with a single thread is faster since the data is small
+          IType* data_ptr = data.dptr<IType>();
+          bool is_valid = CheckIndexOutOfBound(data_ptr, data.shape_.Size(), min, max);
+          CHECK(is_valid) << "Embedding input contains data out of bound";
+        }
         // mark row flags
         Fill<false>(s, TBlob(row_flg, Shape1(num_rows), cpu::kDevMask), kWriteTo, 0);
         Kernel<MarkRowFlgKernel, cpu>::Launch(s, data_size, row_flg, data.dptr<IType>());
diff --git a/src/operator/tensor/indexing_op.cu b/src/operator/tensor/indexing_op.cu
index 39fd81ef2..bdc7f6e84 100644
--- a/src/operator/tensor/indexing_op.cu
+++ b/src/operator/tensor/indexing_op.cu
@@ -36,7 +36,7 @@ namespace op {
 
 struct is_valid_check {
   template<typename DType>
-  MSHADOW_XINLINE static void Map(int i, int32_t* out, const DType* data,
+  MSHADOW_XINLINE static void Map(int i, char* out, const DType* data,
                                   const DType min, const DType max) {
     if (data[i] < min || data[i] > max) *out = 1;
   }
@@ -116,6 +116,27 @@ struct AddTakeGradRspDeterministicKernel {
   }
 };
 
+/*
+ * \brief returns true if all indices are between [min, max]
+ * \param s the stream
+ * \param data_ptr the indices on the stream
+ * \param data_size the number of indices to examine
+ * \param min the expected min value for indices
+ * \param max the expected max value for indices
+ * \param is_valid_ptr the temporary workspace
+ */
+template<typename DType>
+bool CheckIndexOutOfBound(mshadow::Stream<gpu> *s, const DType* data_ptr, size_t data_size,
+                          const DType min, const DType max, char* is_valid_ptr) {
+  using namespace mxnet_op;
+  int32_t is_valid = 0;
+  Kernel<set_zero, gpu>::Launch(s, 1, is_valid_ptr);
+  Kernel<is_valid_check, gpu>::Launch(s, data_size, is_valid_ptr, data_ptr, min, max);
+  CUDA_CALL(cudaMemcpy(&is_valid, is_valid_ptr, sizeof(char),
+                       cudaMemcpyDeviceToHost));
+  return is_valid == 0;
+}
+
 template<>
 void SparseEmbeddingOpForwardRspImpl<gpu>(const OpContext& ctx,
                                           const TBlob& data,
@@ -136,21 +157,17 @@
     return;
   }
   // check out-of-bound indices
-  int32_t is_valid = 0;
   MSHADOW_TYPE_SWITCH(data.type_flag_, DType, {
     DType min = 0;
     DType max = static_cast<DType>(weight.shape()[0] - 1);
     DType* data_ptr = data.dptr<DType>();
     size_t data_size = data.shape_.Size();
     Tensor<gpu, 1, char> workspace = ctx.requested[0]
-        .get_space_typed<gpu, 1, char>(Shape1(sizeof(int32_t)), s);
-    int32_t* is_valid_ptr = reinterpret_cast<int32_t*>(workspace.dptr_);
-    Kernel<set_zero, gpu>::Launch(s, 1, is_valid_ptr);
-    Kernel<is_valid_check, gpu>::Launch(s, data_size, is_valid_ptr, data_ptr, min, max);
-    CUDA_CALL(cudaMemcpy(&is_valid, is_valid_ptr, sizeof(int32_t),
-                         cudaMemcpyDeviceToHost));
+        .get_space_typed<gpu, 1, char>(Shape1(1), s);
+    char* is_valid_ptr = reinterpret_cast<char*>(workspace.dptr_);
+    bool is_valid = CheckIndexOutOfBound(s, data_ptr, data_size, min, max, is_valid_ptr);
+    CHECK(is_valid) << "SparseEmbedding input contains data out of bound";
  })
-  CHECK_EQ(is_valid, 0) << "SparseEmbedding input contains data out of bound";
   // the weight is actually dense
   if (weight.aux_shape(kIdx)[0] == weight.shape()[0]) {
     EmbeddingOpForwardDnsImpl<gpu>(s, data, weight.data(), req, output);
@@ -207,6 +224,17 @@ void SparseEmbeddingDeterministicKernelLaunch(const OpContext& ctx,
                        sorted_data_storage_bytes);
   temp_storage = workspace.dptr_ + total_storage_bytes - temp_workspace_bytes;
 
+  // check out-of-bound indices
+  {
+    IType min = 0;
+    IType max = static_cast<IType>(output.shape()[0] - 1);
+    IType* data_ptr = data.dptr<IType>();
+    size_t data_size = data.shape_.Size();
+    bool is_valid = CheckIndexOutOfBound(s, data_ptr, data_size, min, max,
+                                         reinterpret_cast<char*>(temp_storage));
+    CHECK(is_valid) << "Embedding input contains data out of bound";
+  }
+
   // make a copy of the data, to be sorted
   TBlob sorted_data_blob(sorted_data, Shape1(data_size), gpu::kDevMask);
   auto sorted_data_tensor = sorted_data_blob.FlatTo1D<gpu, DType>(s);

From fc912f31927921cf5e14b00e4c66db1605f1db13 Mon Sep 17 00:00:00 2001
From: Lin Yuan
Date: Wed, 1 Aug 2018 09:08:59 -0700
Subject: [PATCH 39/63] [MXNET-770] Remove fixed seed in flaky test (#11958)

* Remove fixed seed in flaky test

* Remove fixed seed in flaky test
---
 tests/python/unittest/test_module.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/python/unittest/test_module.py b/tests/python/unittest/test_module.py
index 802988b43..41ea5828d 100644
--- a/tests/python/unittest/test_module.py
+++ b/tests/python/unittest/test_module.py
@@ -381,7 +381,7 @@ def test_module_set_params():
                     aux_params={}, allow_missing=True, allow_extra=False)
 
-@with_seed(11)
+@with_seed()
 def test_monitor():
     # data iter
     data = mx.nd.array([[0.05, .10]]);

From 394e5ccc7937de61892e79253e6a8c05aebe9d43 Mon Sep 17 00:00:00 2001
From: Anirudh
Date: Wed, 1 Aug 2018 11:00:01 -0700
Subject: [PATCH 40/63] Update ONNX docs with the latest supported ONNX
 version (#11936)

---
 docs/api/python/contrib/onnx.md | 11 ++++++-----
 1 file
changed, 6 insertions(+), 5 deletions(-) diff --git a/docs/api/python/contrib/onnx.md b/docs/api/python/contrib/onnx.md index 3fe204800..d7c34ec1e 100644 --- a/docs/api/python/contrib/onnx.md +++ b/docs/api/python/contrib/onnx.md @@ -13,7 +13,7 @@ With ONNX format support for MXNet, developers can build and train models with a ``` ### Installation Instructions -- To use this module developers need to **install ONNX**, which requires the protobuf compiler to be installed separately. Please follow the [instructions to install ONNX and its dependencies](https://github.com/onnx/onnx#installation). **MXNet currently supports ONNX v1.1.1**. Once installed, you can go through the tutorials on how to use this module. +- To use this module developers need to **install ONNX**, which requires the protobuf compiler to be installed separately. Please follow the [instructions to install ONNX and its dependencies](https://github.com/onnx/onnx#installation). **MXNet currently supports ONNX v1.2.1**. Once installed, you can go through the tutorials on how to use this module. This document describes all the ONNX-MXNet APIs. @@ -24,6 +24,7 @@ This document describes all the ONNX-MXNet APIs. mxnet.contrib.onnx.import_model mxnet.contrib.onnx.get_model_metadata + mxnet.contrib.onnx.import_to_gluon mxnet.contrib.onnx.export_model ``` @@ -49,10 +50,10 @@ This document describes all the ONNX-MXNet APIs. ```eval_rst -.. automodule:: mxnet.contrib.onnx - :members: import_model - :members: get_model_metadata - :members: export_model +.. automodule:: mxnet.contrib.onnx.import_model +.. automodule:: mxnet.contrib.onnx.get_model_metadata +.. automodule:: mxnet.contrib.onnx.import_to_gluon +.. automodule:: mxnet.contrib.onnx.export_model ``` From eed7a34aa8c8145950fd282cdfe3ab16a358dc5c Mon Sep 17 00:00:00 2001 From: Andrew Ayres Date: Wed, 1 Aug 2018 13:22:04 -0700 Subject: [PATCH 41/63] Reduced test to 3 epochs and made gpu only (#11863) * Reduced test to 3 epochs and made GPU only * Moved logger variable so that it's accessible --- .../multitask/MultiTaskSuite.scala | 25 +++++++++++-------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/scala-package/examples/src/test/scala/org/apache/mxnetexamples/multitask/MultiTaskSuite.scala b/scala-package/examples/src/test/scala/org/apache/mxnetexamples/multitask/MultiTaskSuite.scala index dab977019..b86f6751e 100644 --- a/scala-package/examples/src/test/scala/org/apache/mxnetexamples/multitask/MultiTaskSuite.scala +++ b/scala-package/examples/src/test/scala/org/apache/mxnetexamples/multitask/MultiTaskSuite.scala @@ -44,21 +44,24 @@ import scala.collection.mutable.{ArrayBuffer, ListBuffer} * This will run as a part of "make scalatest" */ class MultiTaskSuite extends FunSuite { - test("Multitask Test") { val logger = LoggerFactory.getLogger(classOf[MultiTaskSuite]) - logger.info("Multitask Test...") + if (System.getenv().containsKey("SCALA_TEST_ON_GPU") && + System.getenv("SCALA_TEST_ON_GPU").toInt == 1) { + logger.info("Multitask Test...") - val batchSize = 100 - val numEpoch = 10 - val ctx = Context.cpu() + val batchSize = 100 + val numEpoch = 3 + val ctx = Context.gpu() - val modelPath = ExampleMultiTask.getTrainingData - val (executor, evalMetric) = ExampleMultiTask.train(batchSize, numEpoch, ctx, modelPath) - evalMetric.get.foreach { case (name, value) => - assert(value >= 0.95f) + val modelPath = ExampleMultiTask.getTrainingData + val (executor, evalMetric) = ExampleMultiTask.train(batchSize, numEpoch, ctx, modelPath) + evalMetric.get.foreach { case (name, 
value) =>
+        assert(value >= 0.95f)
+      }
+      executor.dispose()
+    } else {
+      logger.info("GPU test only, skipped...")
     }
-    executor.dispose()
   }
-
 }

From c6a32b6cfb4c984d3ce96f8f651b4fd4df2bd3f5 Mon Sep 17 00:00:00 2001
From: Piyush Ghai
Date: Wed, 1 Aug 2018 14:49:09 -0700
Subject: [PATCH 42/63] Fix flaky tests for test_laop_4 (#11972)

---
 tests/python/unittest/test_operator.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py
index 12d0bd116..418951fcb 100644
--- a/tests/python/unittest/test_operator.py
+++ b/tests/python/unittest/test_operator.py
@@ -5465,8 +5465,9 @@ def test_laop_3():
         check_grad(test_syevd_l_4, [a_batch])
 
-# Seed set because the test is not robust enough to operate on random data
-@with_seed(1896893923)
+# @piyushghai - Removing the fixed seed for this test.
+# Issue for flakiness is tracked at - https://github.com/apache/incubator-mxnet/issues/11721
+@with_seed()
 def test_laop_4():
     # Currently disabled on GPU as syevd needs cuda8
     # and MxNet builds use cuda 7.5

From 061076dc83fbd26bc88911c3b0dbcbee81095d1f Mon Sep 17 00:00:00 2001
From: Sergey Sokolov
Date: Wed, 1 Aug 2018 15:24:35 -0700
Subject: [PATCH 43/63] Updating R client docs (#11954)

* Updating R client docs

* Forcing build
---
 R-package/R/mlp.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R-package/R/mlp.R b/R-package/R/mlp.R
index ecc30999d..aa510d103 100644
--- a/R-package/R/mlp.R
+++ b/R-package/R/mlp.R
@@ -8,7 +8,7 @@
 #' @param activation either a single string or a vector containing the names of the activation functions.
 #' @param out_activation a single string containing the name of the output activation function.
 #' @param ctx whether train on cpu (default) or gpu.
-#' @param eval_metric the evaluation metric/
+#' @param eval.metric the evaluation metric.
 #' @param ... other parameters passing to \code{mx.model.FeedForward.create}/
 #'
 #' @examples

From 31c5fbcac00ad00543d93754ae67e29c27eb1033 Mon Sep 17 00:00:00 2001
From: Ankit Khedia <36249596+ankkhedia@users.noreply.github.com>
Date: Wed, 1 Aug 2018 16:34:27 -0700
Subject: [PATCH 44/63] Fix install instructions for MXNET-R (#11976)

* fix install instructions for MXNET-R

* fix install instructions for MXNET-R

* fix default cuda version for MXNet-R
---
 docs/install/index.md         | 13 +++++++------
 docs/install/windows_setup.md |  4 ++--
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/docs/install/index.md b/docs/install/index.md
index d4704df2e..57c50eb9b 100644
--- a/docs/install/index.md
+++ b/docs/install/index.md
@@ -1784,7 +1784,7 @@
Install the latest version (3.5.1+) of R from [CRAN](https://cran.r-project.org/bin/windows/). -You can [build MXNet-R from source](windows_setup.html#install-the-mxnet-package-for-r), or you can use a pre-built binary: +You can [build MXNet-R from source](windows_setup.html#install-mxnet-package-for-r), or you can use a pre-built binary: ```r cran <- getOption("repos") @@ -1797,14 +1797,15 @@ install.packages("mxnet")
-You can [build MXNet-R from source](windows_setup.html#install-the-mxnet-package-for-r), or you can use a pre-built binary: +You can [build MXNet-R from source](windows_setup.html#install-mxnet-package-for-r), or you can use a pre-built binary: ```r -cran <- getOption("repos") -cran["dmlc"] <- "https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/R/CRAN/GPU" -options(repos = cran) -install.packages("mxnet") + cran <- getOption("repos") + cran["dmlc"] <- "https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/R/CRAN/GPU/cu92" + options(repos = cran) + install.packages("mxnet") ``` +Change cu92 to cu80, cu90 or cu91 based on your CUDA toolkit version. Currently, MXNet supports these versions of CUDA.
diff --git a/docs/install/windows_setup.md b/docs/install/windows_setup.md index 9d03474b5..40ddeb818 100755 --- a/docs/install/windows_setup.md +++ b/docs/install/windows_setup.md @@ -218,11 +218,11 @@ For GPU package: ```r cran <- getOption("repos") - cran["dmlc"] <- "https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/R/CRAN/GPU/cuX" + cran["dmlc"] <- "https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/R/CRAN/GPU/cu92" options(repos = cran) install.packages("mxnet") ``` -Change X to 80,90,91 or 92 based on your CUDA toolkit version. Currently, MXNet supports these versions of CUDA. +Change cu92 to cu80, cu90 or cu91 based on your CUDA toolkit version. Currently, MXNet supports these versions of CUDA. #### Building MXNet from Source Code(GPU) After you have installed above software, continue with the following steps to build MXNet-R: 1. Clone the MXNet github repo. From a93905dcbdbf5f50a769eebc76446f995368e68d Mon Sep 17 00:00:00 2001 From: Lanking Date: Wed, 1 Aug 2018 21:30:00 -0700 Subject: [PATCH 45/63] [MXNET-751] fix ce_loss flaky (#11971) * add xavier initializer * remove comment line --- tests/python/unittest/test_loss.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/python/unittest/test_loss.py b/tests/python/unittest/test_loss.py index 8d5b86341..3c147bc4c 100644 --- a/tests/python/unittest/test_loss.py +++ b/tests/python/unittest/test_loss.py @@ -64,7 +64,8 @@ def get_net(num_hidden, flatten=True): fc3 = mx.symbol.FullyConnected(act2, name='fc3', num_hidden=num_hidden, flatten=flatten) return fc3 -@with_seed(1234) +# tracked at: https://github.com/apache/incubator-mxnet/issues/11692 +@with_seed() def test_ce_loss(): nclass = 10 N = 20 @@ -78,7 +79,8 @@ def test_ce_loss(): loss = mx.sym.make_loss(loss) mod = mx.mod.Module(loss, data_names=('data',), label_names=('label',)) mod.fit(data_iter, num_epoch=200, optimizer_params={'learning_rate': 0.01}, - eval_metric=mx.metric.Loss(), optimizer='adam') + eval_metric=mx.metric.Loss(), optimizer='adam', + initializer=mx.init.Xavier(magnitude=2)) assert mod.score(data_iter, eval_metric=mx.metric.Loss())[0][1] < 0.05 From 564e01acdf460535d4ab7340db39b0d10028b453 Mon Sep 17 00:00:00 2001 From: Pedro Larroy <928489+larroy@users.noreply.github.com> Date: Thu, 2 Aug 2018 09:56:33 +0200 Subject: [PATCH 46/63] [MXNET-769] set MXNET_HOME as base for downloaded models through base.data_dir() (#11636) * set MXNET_DATA_DIR as base for downloaded models through base.data_dir() push joblib to save containers so is not required when running * MXNET_DATA_DIR -> MXNET_HOME --- ci/docker_cache.py | 2 +- .../examples/scripts/get_cifar_data.sh | 4 +- .../examples/scripts/get_mnist_data.sh | 4 +- .../clojure-package/scripts/get_cifar_data.sh | 4 +- .../clojure-package/scripts/get_mnist_data.sh | 4 +- docs/faq/env_var.md | 4 ++ python/mxnet/base.py | 24 ++++++++- python/mxnet/contrib/text/embedding.py | 9 ++-- python/mxnet/gluon/contrib/data/text.py | 11 ++-- python/mxnet/gluon/data/vision/datasets.py | 18 +++---- python/mxnet/gluon/model_zoo/model_store.py | 17 ++++--- .../mxnet/gluon/model_zoo/vision/__init__.py | 2 +- .../mxnet/gluon/model_zoo/vision/alexnet.py | 5 +- .../mxnet/gluon/model_zoo/vision/densenet.py | 13 ++--- .../mxnet/gluon/model_zoo/vision/inception.py | 5 +- .../mxnet/gluon/model_zoo/vision/mobilenet.py | 9 ++-- python/mxnet/gluon/model_zoo/vision/resnet.py | 25 +++++----- .../gluon/model_zoo/vision/squeezenet.py | 9 ++-- python/mxnet/gluon/model_zoo/vision/vgg.py | 21 ++++---- 
python/mxnet/util.py | 30 +++++++++++ scala-package/core/scripts/get_cifar_data.sh | 4 +- scala-package/core/scripts/get_mnist_data.sh | 4 +- .../scala/org/apache/mxnet/TestUtil.scala | 2 +- .../apache/mxnetexamples/gan/GanMnist.scala | 2 +- .../imclassification/TrainMnist.scala | 2 +- .../ImageClassifierExample.scala | 6 +-- tests/python/unittest/test_base.py | 50 +++++++++++++++++++ 27 files changed, 201 insertions(+), 89 deletions(-) create mode 100644 python/mxnet/util.py create mode 100644 tests/python/unittest/test_base.py diff --git a/ci/docker_cache.py b/ci/docker_cache.py index 6637ec377..7a6d1106d 100755 --- a/ci/docker_cache.py +++ b/ci/docker_cache.py @@ -31,7 +31,6 @@ import subprocess import json import build as build_util -from joblib import Parallel, delayed @@ -43,6 +42,7 @@ def build_save_containers(platforms, registry, load_cache) -> int: :param load_cache: Load cache before building :return: 1 if error occurred, 0 otherwise """ + from joblib import Parallel, delayed if len(platforms) == 0: return 0 diff --git a/contrib/clojure-package/examples/scripts/get_cifar_data.sh b/contrib/clojure-package/examples/scripts/get_cifar_data.sh index 372c7bb57..12b3770c2 100755 --- a/contrib/clojure-package/examples/scripts/get_cifar_data.sh +++ b/contrib/clojure-package/examples/scripts/get_cifar_data.sh @@ -20,8 +20,8 @@ set -evx -if [ ! -z "$MXNET_DATA_DIR" ]; then - data_path="$MXNET_DATA_DIR" +if [ ! -z "$MXNET_HOME" ]; then + data_path="$MXNET_HOME" else data_path="./data" fi diff --git a/contrib/clojure-package/examples/scripts/get_mnist_data.sh b/contrib/clojure-package/examples/scripts/get_mnist_data.sh index 6f32b85f4..703ece207 100755 --- a/contrib/clojure-package/examples/scripts/get_mnist_data.sh +++ b/contrib/clojure-package/examples/scripts/get_mnist_data.sh @@ -20,8 +20,8 @@ set -evx -if [ ! -z "$MXNET_DATA_DIR" ]; then - data_path="$MXNET_DATA_DIR" +if [ ! -z "$MXNET_HOME" ]; then + data_path="$MXNET_HOME" else data_path="./data" fi diff --git a/contrib/clojure-package/scripts/get_cifar_data.sh b/contrib/clojure-package/scripts/get_cifar_data.sh index 372c7bb57..12b3770c2 100755 --- a/contrib/clojure-package/scripts/get_cifar_data.sh +++ b/contrib/clojure-package/scripts/get_cifar_data.sh @@ -20,8 +20,8 @@ set -evx -if [ ! -z "$MXNET_DATA_DIR" ]; then - data_path="$MXNET_DATA_DIR" +if [ ! -z "$MXNET_HOME" ]; then + data_path="$MXNET_HOME" else data_path="./data" fi diff --git a/contrib/clojure-package/scripts/get_mnist_data.sh b/contrib/clojure-package/scripts/get_mnist_data.sh index 6f32b85f4..703ece207 100755 --- a/contrib/clojure-package/scripts/get_mnist_data.sh +++ b/contrib/clojure-package/scripts/get_mnist_data.sh @@ -20,8 +20,8 @@ set -evx -if [ ! -z "$MXNET_DATA_DIR" ]; then - data_path="$MXNET_DATA_DIR" +if [ ! -z "$MXNET_HOME" ]; then + data_path="$MXNET_HOME" else data_path="./data" fi diff --git a/docs/faq/env_var.md b/docs/faq/env_var.md index 881bc14fd..6e9a35941 100644 --- a/docs/faq/env_var.md +++ b/docs/faq/env_var.md @@ -152,6 +152,10 @@ When USE_PROFILER is enabled in Makefile or CMake, the following environments ca - Values: String ```(default='https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/'``` - The repository url to be used for Gluon datasets and pre-trained models. +* MXNET_HOME + - Data directory in the filesystem for storage, for example when downloading Gluon models. + - Default is ~/.mxnet on *nix and APPDATA/mxnet on Windows.
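A minimal sketch of how the new variable could be exercised once this patch is applied. The cache path is illustrative only; note that it must be set before `mxnet` is imported, because default locations such as `os.path.join(base.data_dir(), 'models')` are evaluated when the modules are first loaded:

```python
import os

# Set before importing mxnet: default argument values like
# os.path.join(base.data_dir(), 'models') are bound at import time.
os.environ['MXNET_HOME'] = '/tmp/mxnet-cache'  # hypothetical cache location

import mxnet as mx
from mxnet.gluon.model_zoo import vision

print(mx.base.data_dir())              # -> /tmp/mxnet-cache
net = vision.alexnet(pretrained=True)  # parameters cached under /tmp/mxnet-cache/models
```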
+ Settings for Minimum Memory Usage --------------------------------- - Make sure ```min(MXNET_EXEC_NUM_TEMP, MXNET_GPU_WORKER_NTHREADS) = 1``` diff --git a/python/mxnet/base.py b/python/mxnet/base.py index 4df794bdf..3d8ee0191 100644 --- a/python/mxnet/base.py +++ b/python/mxnet/base.py @@ -22,11 +22,11 @@ import atexit import ctypes -import inspect import os import sys import warnings - +import inspect +import platform import numpy as np from . import libinfo @@ -59,6 +59,26 @@ py_str = lambda x: x +def data_dir_default(): + """ + + :return: default data directory depending on the platform and environment variables + """ + system = platform.system() + if system == 'Windows': + return os.path.join(os.environ.get('APPDATA'), 'mxnet') + else: + return os.path.join(os.path.expanduser("~"), '.mxnet') + + +def data_dir(): + """ + + :return: data directory in the filesystem for storage, for example when downloading models + """ + return os.getenv('MXNET_HOME', data_dir_default()) + + class _NullType(object): """Placeholder for arguments""" def __repr__(self): diff --git a/python/mxnet/contrib/text/embedding.py b/python/mxnet/contrib/text/embedding.py index 6598718e6..38defb4b9 100644 --- a/python/mxnet/contrib/text/embedding.py +++ b/python/mxnet/contrib/text/embedding.py @@ -34,6 +34,7 @@ from . import vocab from ... import ndarray as nd from ... import registry +from ... import base def register(embedding_cls): @@ -496,7 +497,7 @@ class GloVe(_TokenEmbedding): ---------- pretrained_file_name : str, default 'glove.840B.300d.txt' The name of the pre-trained token embedding file. - embedding_root : str, default os.path.join('~', '.mxnet', 'embeddings') + embedding_root : str, default $MXNET_HOME/embeddings The root directory for storing embedding-related files. init_unknown_vec : callback The callback used to initialize the embedding vector for the unknown token. @@ -541,7 +542,7 @@ def _get_download_file_name(cls, pretrained_file_name): return archive def __init__(self, pretrained_file_name='glove.840B.300d.txt', - embedding_root=os.path.join('~', '.mxnet', 'embeddings'), + embedding_root=os.path.join(base.data_dir(), 'embeddings'), init_unknown_vec=nd.zeros, vocabulary=None, **kwargs): GloVe._check_pretrained_file_names(pretrained_file_name) @@ -600,7 +601,7 @@ class FastText(_TokenEmbedding): ---------- pretrained_file_name : str, default 'wiki.en.vec' The name of the pre-trained token embedding file. - embedding_root : str, default os.path.join('~', '.mxnet', 'embeddings') + embedding_root : str, default $MXNET_HOME/embeddings The root directory for storing embedding-related files. init_unknown_vec : callback The callback used to initialize the embedding vector for the unknown token. @@ -642,7 +643,7 @@ def _get_download_file_name(cls, pretrained_file_name): return '.'.join(pretrained_file_name.split('.')[:-1])+'.zip' def __init__(self, pretrained_file_name='wiki.simple.vec', - embedding_root=os.path.join('~', '.mxnet', 'embeddings'), + embedding_root=os.path.join(base.data_dir(), 'embeddings'), init_unknown_vec=nd.zeros, vocabulary=None, **kwargs): FastText._check_pretrained_file_names(pretrained_file_name) diff --git a/python/mxnet/gluon/contrib/data/text.py b/python/mxnet/gluon/contrib/data/text.py index 98fe6b657..9e78e3c2e 100644 --- a/python/mxnet/gluon/contrib/data/text.py +++ b/python/mxnet/gluon/contrib/data/text.py @@ -30,8 +30,7 @@ from ...data import dataset from ...utils import download, check_sha1, _get_repo_file_url from ....contrib import text -from .... 
import nd - +from .... import nd, base class _LanguageModelDataset(dataset._DownloadedDataset): # pylint: disable=abstract-method def __init__(self, root, namespace, vocabulary): @@ -116,7 +115,7 @@ class WikiText2(_WikiText): Parameters ---------- - root : str, default '~/.mxnet/datasets/wikitext-2' + root : str, default $MXNET_HOME/datasets/wikitext-2 Path to temp folder for storing data. segment : str, default 'train' Dataset segment. Options are 'train', 'validation', 'test'. @@ -127,7 +126,7 @@ class WikiText2(_WikiText): The sequence length of each sample, regardless of the sentence boundary. """ - def __init__(self, root=os.path.join('~', '.mxnet', 'datasets', 'wikitext-2'), + def __init__(self, root=os.path.join(base.data_dir(), 'datasets', 'wikitext-2'), segment='train', vocab=None, seq_len=35): self._archive_file = ('wikitext-2-v1.zip', '3c914d17d80b1459be871a5039ac23e752a53cbe') self._data_file = {'train': ('wiki.train.tokens', @@ -154,7 +153,7 @@ class WikiText103(_WikiText): Parameters ---------- - root : str, default '~/.mxnet/datasets/wikitext-103' + root : str, default $MXNET_HOME/datasets/wikitext-103 Path to temp folder for storing data. segment : str, default 'train' Dataset segment. Options are 'train', 'validation', 'test'. @@ -164,7 +163,7 @@ class WikiText103(_WikiText): seq_len : int, default 35 The sequence length of each sample, regardless of the sentence boundary. """ - def __init__(self, root=os.path.join('~', '.mxnet', 'datasets', 'wikitext-103'), + def __init__(self, root=os.path.join(base.data_dir(), 'datasets', 'wikitext-103'), segment='train', vocab=None, seq_len=35): self._archive_file = ('wikitext-103-v1.zip', '0aec09a7537b58d4bb65362fee27650eeaba625a') self._data_file = {'train': ('wiki.train.tokens', diff --git a/python/mxnet/gluon/data/vision/datasets.py b/python/mxnet/gluon/data/vision/datasets.py index 74a5aebf1..2c9800038 100644 --- a/python/mxnet/gluon/data/vision/datasets.py +++ b/python/mxnet/gluon/data/vision/datasets.py @@ -30,7 +30,7 @@ from .. import dataset from ...utils import download, check_sha1, _get_repo_file_url -from .... import nd, image, recordio +from .... import nd, image, recordio, base class MNIST(dataset._DownloadedDataset): @@ -40,7 +40,7 @@ class MNIST(dataset._DownloadedDataset): Parameters ---------- - root : str, default '~/.mxnet/datasets/mnist' + root : str, default $MXNET_HOME/datasets/mnist Path to temp folder for storing data. train : bool, default True Whether to load the training or testing set. @@ -51,7 +51,7 @@ class MNIST(dataset._DownloadedDataset): transform=lambda data, label: (data.astype(np.float32)/255, label) """ - def __init__(self, root=os.path.join('~', '.mxnet', 'datasets', 'mnist'), + def __init__(self, root=os.path.join(base.data_dir(), 'datasets', 'mnist'), train=True, transform=None): self._train = train self._train_data = ('train-images-idx3-ubyte.gz', @@ -101,7 +101,7 @@ class FashionMNIST(MNIST): Parameters ---------- - root : str, default '~/.mxnet/datasets/fashion-mnist' + root : str, default $MXNET_HOME/datasets/fashion-mnist' Path to temp folder for storing data. train : bool, default True Whether to load the training or testing set. 
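For the dataset classes above, the change only moves the default `root` onto `base.data_dir()`; passing an explicit `root` still takes precedence. A hedged illustration (paths hypothetical, and downloads happen on first use):

```python
import mxnet as mx

# Default root now resolves through base.data_dir(),
# i.e. $MXNET_HOME/datasets/mnist when MXNET_HOME is set.
mnist = mx.gluon.data.vision.MNIST(train=True)

# An explicit root bypasses MXNET_HOME entirely.
mnist_local = mx.gluon.data.vision.MNIST(root='/tmp/mnist', train=True)

print(len(mnist), len(mnist_local))
```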
@@ -112,7 +112,7 @@ class FashionMNIST(MNIST): transform=lambda data, label: (data.astype(np.float32)/255, label) """ - def __init__(self, root=os.path.join('~', '.mxnet', 'datasets', 'fashion-mnist'), + def __init__(self, root=os.path.join(base.data_dir(), 'datasets', 'fashion-mnist'), train=True, transform=None): self._train = train self._train_data = ('train-images-idx3-ubyte.gz', @@ -134,7 +134,7 @@ class CIFAR10(dataset._DownloadedDataset): Parameters ---------- - root : str, default '~/.mxnet/datasets/cifar10' + root : str, default $MXNET_HOME/datasets/cifar10 Path to temp folder for storing data. train : bool, default True Whether to load the training or testing set. @@ -145,7 +145,7 @@ class CIFAR10(dataset._DownloadedDataset): transform=lambda data, label: (data.astype(np.float32)/255, label) """ - def __init__(self, root=os.path.join('~', '.mxnet', 'datasets', 'cifar10'), + def __init__(self, root=os.path.join(base.data_dir(), 'datasets', 'cifar10'), train=True, transform=None): self._train = train self._archive_file = ('cifar-10-binary.tar.gz', 'fab780a1e191a7eda0f345501ccd62d20f7ed891') @@ -197,7 +197,7 @@ class CIFAR100(CIFAR10): Parameters ---------- - root : str, default '~/.mxnet/datasets/cifar100' + root : str, default $MXNET_HOME/datasets/cifar100 Path to temp folder for storing data. fine_label : bool, default False Whether to load the fine-grained (100 classes) or coarse-grained (20 super-classes) labels. @@ -210,7 +210,7 @@ class CIFAR100(CIFAR10): transform=lambda data, label: (data.astype(np.float32)/255, label) """ - def __init__(self, root=os.path.join('~', '.mxnet', 'datasets', 'cifar100'), + def __init__(self, root=os.path.join(base.data_dir(), 'datasets', 'cifar100'), fine_label=False, train=True, transform=None): self._train = train self._archive_file = ('cifar-100-binary.tar.gz', 'a0bb982c76b83111308126cc779a992fa506b90b') diff --git a/python/mxnet/gluon/model_zoo/model_store.py b/python/mxnet/gluon/model_zoo/model_store.py index 7eead68f0..11ac47bae 100644 --- a/python/mxnet/gluon/model_zoo/model_store.py +++ b/python/mxnet/gluon/model_zoo/model_store.py @@ -21,8 +21,10 @@ __all__ = ['get_model_file', 'purge'] import os import zipfile +import logging from ..utils import download, check_sha1 +from ... import base, util _model_sha1 = {name: checksum for checksum, name in [ ('44335d1f0046b328243b32a26a4fbd62d9057b45', 'alexnet'), @@ -68,7 +70,7 @@ def short_hash(name): raise ValueError('Pretrained model for {name} is not available.'.format(name=name)) return _model_sha1[name][:8] -def get_model_file(name, root=os.path.join('~', '.mxnet', 'models')): +def get_model_file(name, root=os.path.join(base.data_dir(), 'models')): r"""Return location for the pretrained on local file system. This function will download from online model zoo when model cannot be found or has mismatch. @@ -78,7 +80,7 @@ def get_model_file(name, root=os.path.join('~', '.mxnet', 'models')): ---------- name : str Name of the model. - root : str, default '~/.mxnet/models' + root : str, default $MXNET_HOME/models Location for keeping the model parameters. Returns @@ -95,12 +97,11 @@ def get_model_file(name, root=os.path.join('~', '.mxnet', 'models')): if check_sha1(file_path, sha1_hash): return file_path else: - print('Mismatch in the content of model file detected. Downloading again.') + logging.warning('Mismatch in the content of model file detected. Downloading again.') else: - print('Model file is not found. Downloading.') + logging.info('Model file not found. 
Downloading to %s.', file_path) - if not os.path.exists(root): - os.makedirs(root) + util.makedirs(root) zip_file_path = os.path.join(root, file_name+'.zip') repo_url = os.environ.get('MXNET_GLUON_REPO', apache_repo_url) @@ -118,12 +119,12 @@ def get_model_file(name, root=os.path.join('~', '.mxnet', 'models')): else: raise ValueError('Downloaded file has different hash. Please try again.') -def purge(root=os.path.join('~', '.mxnet', 'models')): +def purge(root=os.path.join(base.data_dir(), 'models')): r"""Purge all pretrained model files in local file store. Parameters ---------- - root : str, default '~/.mxnet/models' + root : str, default '$MXNET_HOME/models' Location for keeping the model parameters. """ root = os.path.expanduser(root) diff --git a/python/mxnet/gluon/model_zoo/vision/__init__.py b/python/mxnet/gluon/model_zoo/vision/__init__.py index a6e5dc137..7d33ce409 100644 --- a/python/mxnet/gluon/model_zoo/vision/__init__.py +++ b/python/mxnet/gluon/model_zoo/vision/__init__.py @@ -101,7 +101,7 @@ def get_model(name, **kwargs): Number of classes for the output layer. ctx : Context, default CPU The context in which to load the pretrained weights. - root : str, default '~/.mxnet/models' + root : str, default '$MXNET_HOME/models' Location for keeping the model parameters. Returns diff --git a/python/mxnet/gluon/model_zoo/vision/alexnet.py b/python/mxnet/gluon/model_zoo/vision/alexnet.py index fdb006258..daf4617cd 100644 --- a/python/mxnet/gluon/model_zoo/vision/alexnet.py +++ b/python/mxnet/gluon/model_zoo/vision/alexnet.py @@ -25,6 +25,7 @@ from ....context import cpu from ...block import HybridBlock from ... import nn +from .... import base # Net class AlexNet(HybridBlock): @@ -68,7 +69,7 @@ def hybrid_forward(self, F, x): # Constructor def alexnet(pretrained=False, ctx=cpu(), - root=os.path.join('~', '.mxnet', 'models'), **kwargs): + root=os.path.join(base.data_dir(), 'models'), **kwargs): r"""AlexNet model from the `"One weird trick..." `_ paper. Parameters @@ -77,7 +78,7 @@ def alexnet(pretrained=False, ctx=cpu(), Whether to load the pretrained weights for model. ctx : Context, default CPU The context in which to load the pretrained weights. - root : str, default '~/.mxnet/models' + root : str, default $MXNET_HOME/models Location for keeping the model parameters. """ net = AlexNet(**kwargs) diff --git a/python/mxnet/gluon/model_zoo/vision/densenet.py b/python/mxnet/gluon/model_zoo/vision/densenet.py index b03f5ce8d..83febd365 100644 --- a/python/mxnet/gluon/model_zoo/vision/densenet.py +++ b/python/mxnet/gluon/model_zoo/vision/densenet.py @@ -26,6 +26,7 @@ from ...block import HybridBlock from ... import nn from ...contrib.nn import HybridConcurrent, Identity +from .... import base # Helpers def _make_dense_block(num_layers, bn_size, growth_rate, dropout, stage_index): @@ -122,7 +123,7 @@ def hybrid_forward(self, F, x): # Constructor def get_densenet(num_layers, pretrained=False, ctx=cpu(), - root=os.path.join('~', '.mxnet', 'models'), **kwargs): + root=os.path.join(base.data_dir(), 'models'), **kwargs): r"""Densenet-BC model from the `"Densely Connected Convolutional Networks" `_ paper. @@ -134,7 +135,7 @@ def get_densenet(num_layers, pretrained=False, ctx=cpu(), Whether to load the pretrained weights for model. ctx : Context, default CPU The context in which to load the pretrained weights. - root : str, default '~/.mxnet/models' + root : str, default $MXNET_HOME/models Location for keeping the model parameters. 
""" num_init_features, growth_rate, block_config = densenet_spec[num_layers] @@ -154,7 +155,7 @@ def densenet121(**kwargs): Whether to load the pretrained weights for model. ctx : Context, default CPU The context in which to load the pretrained weights. - root : str, default '~/.mxnet/models' + root : str, default '$MXNET_HOME/models' Location for keeping the model parameters. """ return get_densenet(121, **kwargs) @@ -169,7 +170,7 @@ def densenet161(**kwargs): Whether to load the pretrained weights for model. ctx : Context, default CPU The context in which to load the pretrained weights. - root : str, default '~/.mxnet/models' + root : str, default '$MXNET_HOME/models' Location for keeping the model parameters. """ return get_densenet(161, **kwargs) @@ -184,7 +185,7 @@ def densenet169(**kwargs): Whether to load the pretrained weights for model. ctx : Context, default CPU The context in which to load the pretrained weights. - root : str, default '~/.mxnet/models' + root : str, default '$MXNET_HOME/models' Location for keeping the model parameters. """ return get_densenet(169, **kwargs) @@ -199,7 +200,7 @@ def densenet201(**kwargs): Whether to load the pretrained weights for model. ctx : Context, default CPU The context in which to load the pretrained weights. - root : str, default '~/.mxnet/models' + root : str, default '$MXNET_HOME/models' Location for keeping the model parameters. """ return get_densenet(201, **kwargs) diff --git a/python/mxnet/gluon/model_zoo/vision/inception.py b/python/mxnet/gluon/model_zoo/vision/inception.py index 7c54691f1..6bdc526a6 100644 --- a/python/mxnet/gluon/model_zoo/vision/inception.py +++ b/python/mxnet/gluon/model_zoo/vision/inception.py @@ -26,6 +26,7 @@ from ...block import HybridBlock from ... import nn from ...contrib.nn import HybridConcurrent +from .... import base # Helpers def _make_basic_conv(**kwargs): @@ -199,7 +200,7 @@ def hybrid_forward(self, F, x): # Constructor def inception_v3(pretrained=False, ctx=cpu(), - root=os.path.join('~', '.mxnet', 'models'), **kwargs): + root=os.path.join(base.data_dir(), 'models'), **kwargs): r"""Inception v3 model from `"Rethinking the Inception Architecture for Computer Vision" `_ paper. @@ -210,7 +211,7 @@ def inception_v3(pretrained=False, ctx=cpu(), Whether to load the pretrained weights for model. ctx : Context, default CPU The context in which to load the pretrained weights. - root : str, default '~/.mxnet/models' + root : str, default $MXNET_HOME/models Location for keeping the model parameters. """ net = Inception3(**kwargs) diff --git a/python/mxnet/gluon/model_zoo/vision/mobilenet.py b/python/mxnet/gluon/model_zoo/vision/mobilenet.py index 1a2c9b946..1a84e05af 100644 --- a/python/mxnet/gluon/model_zoo/vision/mobilenet.py +++ b/python/mxnet/gluon/model_zoo/vision/mobilenet.py @@ -30,6 +30,7 @@ from ... import nn from ....context import cpu from ...block import HybridBlock +from .... import base # Helpers @@ -188,7 +189,7 @@ def hybrid_forward(self, F, x): # Constructor def get_mobilenet(multiplier, pretrained=False, ctx=cpu(), - root=os.path.join('~', '.mxnet', 'models'), **kwargs): + root=os.path.join(base.data_dir(), 'models'), **kwargs): r"""MobileNet model from the `"MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications" `_ paper. @@ -203,7 +204,7 @@ def get_mobilenet(multiplier, pretrained=False, ctx=cpu(), Whether to load the pretrained weights for model. ctx : Context, default CPU The context in which to load the pretrained weights. 
- root : str, default '~/.mxnet/models' + root : str, default $MXNET_HOME/models Location for keeping the model parameters. """ net = MobileNet(multiplier, **kwargs) @@ -219,7 +220,7 @@ def get_mobilenet(multiplier, pretrained=False, ctx=cpu(), def get_mobilenet_v2(multiplier, pretrained=False, ctx=cpu(), - root=os.path.join('~', '.mxnet', 'models'), **kwargs): + root=os.path.join(base.data_dir(), 'models'), **kwargs): r"""MobileNetV2 model from the `"Inverted Residuals and Linear Bottlenecks: Mobile Networks for Classification, Detection and Segmentation" @@ -235,7 +236,7 @@ def get_mobilenet_v2(multiplier, pretrained=False, ctx=cpu(), Whether to load the pretrained weights for model. ctx : Context, default CPU The context in which to load the pretrained weights. - root : str, default '~/.mxnet/models' + root : str, default $MXNET_HOME/models Location for keeping the model parameters. """ net = MobileNetV2(multiplier, **kwargs) diff --git a/python/mxnet/gluon/model_zoo/vision/resnet.py b/python/mxnet/gluon/model_zoo/vision/resnet.py index da279b895..48390decb 100644 --- a/python/mxnet/gluon/model_zoo/vision/resnet.py +++ b/python/mxnet/gluon/model_zoo/vision/resnet.py @@ -32,6 +32,7 @@ from ....context import cpu from ...block import HybridBlock from ... import nn +from .... import base # Helpers def _conv3x3(channels, stride, in_channels): @@ -356,7 +357,7 @@ def hybrid_forward(self, F, x): # Constructor def get_resnet(version, num_layers, pretrained=False, ctx=cpu(), - root=os.path.join('~', '.mxnet', 'models'), **kwargs): + root=os.path.join(base.data_dir(), 'models'), **kwargs): r"""ResNet V1 model from `"Deep Residual Learning for Image Recognition" `_ paper. ResNet V2 model from `"Identity Mappings in Deep Residual Networks" @@ -372,7 +373,7 @@ def get_resnet(version, num_layers, pretrained=False, ctx=cpu(), Whether to load the pretrained weights for model. ctx : Context, default CPU The context in which to load the pretrained weights. - root : str, default '~/.mxnet/models' + root : str, default $MXNET_HOME/models Location for keeping the model parameters. """ assert num_layers in resnet_spec, \ @@ -400,7 +401,7 @@ def resnet18_v1(**kwargs): Whether to load the pretrained weights for model. ctx : Context, default CPU The context in which to load the pretrained weights. - root : str, default '~/.mxnet/models' + root : str, default '$MXNET_HOME/models' Location for keeping the model parameters. """ return get_resnet(1, 18, **kwargs) @@ -415,7 +416,7 @@ def resnet34_v1(**kwargs): Whether to load the pretrained weights for model. ctx : Context, default CPU The context in which to load the pretrained weights. - root : str, default '~/.mxnet/models' + root : str, default '$MXNET_HOME/models' Location for keeping the model parameters. """ return get_resnet(1, 34, **kwargs) @@ -430,7 +431,7 @@ def resnet50_v1(**kwargs): Whether to load the pretrained weights for model. ctx : Context, default CPU The context in which to load the pretrained weights. - root : str, default '~/.mxnet/models' + root : str, default '$MXNET_HOME/models' Location for keeping the model parameters. """ return get_resnet(1, 50, **kwargs) @@ -445,7 +446,7 @@ def resnet101_v1(**kwargs): Whether to load the pretrained weights for model. ctx : Context, default CPU The context in which to load the pretrained weights. - root : str, default '~/.mxnet/models' + root : str, default '$MXNET_HOME/models' Location for keeping the model parameters. 
""" return get_resnet(1, 101, **kwargs) @@ -460,7 +461,7 @@ def resnet152_v1(**kwargs): Whether to load the pretrained weights for model. ctx : Context, default CPU The context in which to load the pretrained weights. - root : str, default '~/.mxnet/models' + root : str, default '$MXNET_HOME/models' Location for keeping the model parameters. """ return get_resnet(1, 152, **kwargs) @@ -475,7 +476,7 @@ def resnet18_v2(**kwargs): Whether to load the pretrained weights for model. ctx : Context, default CPU The context in which to load the pretrained weights. - root : str, default '~/.mxnet/models' + root : str, default '$MXNET_HOME/models' Location for keeping the model parameters. """ return get_resnet(2, 18, **kwargs) @@ -490,7 +491,7 @@ def resnet34_v2(**kwargs): Whether to load the pretrained weights for model. ctx : Context, default CPU The context in which to load the pretrained weights. - root : str, default '~/.mxnet/models' + root : str, default '$MXNET_HOME/models' Location for keeping the model parameters. """ return get_resnet(2, 34, **kwargs) @@ -505,7 +506,7 @@ def resnet50_v2(**kwargs): Whether to load the pretrained weights for model. ctx : Context, default CPU The context in which to load the pretrained weights. - root : str, default '~/.mxnet/models' + root : str, default '$MXNET_HOME/models' Location for keeping the model parameters. """ return get_resnet(2, 50, **kwargs) @@ -520,7 +521,7 @@ def resnet101_v2(**kwargs): Whether to load the pretrained weights for model. ctx : Context, default CPU The context in which to load the pretrained weights. - root : str, default '~/.mxnet/models' + root : str, default '$MXNET_HOME/models' Location for keeping the model parameters. """ return get_resnet(2, 101, **kwargs) @@ -535,7 +536,7 @@ def resnet152_v2(**kwargs): Whether to load the pretrained weights for model. ctx : Context, default CPU The context in which to load the pretrained weights. - root : str, default '~/.mxnet/models' + root : str, default '$MXNET_HOME/models' Location for keeping the model parameters. """ return get_resnet(2, 152, **kwargs) diff --git a/python/mxnet/gluon/model_zoo/vision/squeezenet.py b/python/mxnet/gluon/model_zoo/vision/squeezenet.py index aaff4c36d..b97d1274a 100644 --- a/python/mxnet/gluon/model_zoo/vision/squeezenet.py +++ b/python/mxnet/gluon/model_zoo/vision/squeezenet.py @@ -26,6 +26,7 @@ from ...block import HybridBlock from ... import nn from ...contrib.nn import HybridConcurrent +from .... import base # Helpers def _make_fire(squeeze_channels, expand1x1_channels, expand3x3_channels): @@ -110,7 +111,7 @@ def hybrid_forward(self, F, x): # Constructor def get_squeezenet(version, pretrained=False, ctx=cpu(), - root=os.path.join('~', '.mxnet', 'models'), **kwargs): + root=os.path.join(base.data_dir(), 'models'), **kwargs): r"""SqueezeNet model from the `"SqueezeNet: AlexNet-level accuracy with 50x fewer parameters and <0.5MB model size" `_ paper. SqueezeNet 1.1 model from the `official SqueezeNet repo @@ -126,7 +127,7 @@ def get_squeezenet(version, pretrained=False, ctx=cpu(), Whether to load the pretrained weights for model. ctx : Context, default CPU The context in which to load the pretrained weights. - root : str, default '~/.mxnet/models' + root : str, default $MXNET_HOME/models Location for keeping the model parameters. """ net = SqueezeNet(version, **kwargs) @@ -145,7 +146,7 @@ def squeezenet1_0(**kwargs): Whether to load the pretrained weights for model. ctx : Context, default CPU The context in which to load the pretrained weights. 
- root : str, default '~/.mxnet/models' + root : str, default '$MXNET_HOME/models' Location for keeping the model parameters. """ return get_squeezenet('1.0', **kwargs) @@ -162,7 +163,7 @@ def squeezenet1_1(**kwargs): Whether to load the pretrained weights for model. ctx : Context, default CPU The context in which to load the pretrained weights. - root : str, default '~/.mxnet/models' + root : str, default '$MXNET_HOME/models' Location for keeping the model parameters. """ return get_squeezenet('1.1', **kwargs) diff --git a/python/mxnet/gluon/model_zoo/vision/vgg.py b/python/mxnet/gluon/model_zoo/vision/vgg.py index a3b1685b4..9a740e633 100644 --- a/python/mxnet/gluon/model_zoo/vision/vgg.py +++ b/python/mxnet/gluon/model_zoo/vision/vgg.py @@ -30,6 +30,7 @@ from ....initializer import Xavier from ...block import HybridBlock from ... import nn +from .... import base class VGG(HybridBlock): @@ -94,7 +95,7 @@ def hybrid_forward(self, F, x): # Constructors def get_vgg(num_layers, pretrained=False, ctx=cpu(), - root=os.path.join('~', '.mxnet', 'models'), **kwargs): + root=os.path.join(base.data_dir(), 'models'), **kwargs): r"""VGG model from the `"Very Deep Convolutional Networks for Large-Scale Image Recognition" `_ paper. @@ -106,7 +107,7 @@ def get_vgg(num_layers, pretrained=False, ctx=cpu(), Whether to load the pretrained weights for model. ctx : Context, default CPU The context in which to load the pretrained weights. - root : str, default '~/.mxnet/models' + root : str, default $MXNET_HOME/models Location for keeping the model parameters. """ layers, filters = vgg_spec[num_layers] @@ -128,7 +129,7 @@ def vgg11(**kwargs): Whether to load the pretrained weights for model. ctx : Context, default CPU The context in which to load the pretrained weights. - root : str, default '~/.mxnet/models' + root : str, default '$MXNET_HOME/models' Location for keeping the model parameters. """ return get_vgg(11, **kwargs) @@ -143,7 +144,7 @@ def vgg13(**kwargs): Whether to load the pretrained weights for model. ctx : Context, default CPU The context in which to load the pretrained weights. - root : str, default '~/.mxnet/models' + root : str, default '$MXNET_HOME/models' Location for keeping the model parameters. """ return get_vgg(13, **kwargs) @@ -158,7 +159,7 @@ def vgg16(**kwargs): Whether to load the pretrained weights for model. ctx : Context, default CPU The context in which to load the pretrained weights. - root : str, default '~/.mxnet/models' + root : str, default '$MXNET_HOME/models' Location for keeping the model parameters. """ return get_vgg(16, **kwargs) @@ -173,7 +174,7 @@ def vgg19(**kwargs): Whether to load the pretrained weights for model. ctx : Context, default CPU The context in which to load the pretrained weights. - root : str, default '~/.mxnet/models' + root : str, default '$MXNET_HOME/models' Location for keeping the model parameters. """ return get_vgg(19, **kwargs) @@ -189,7 +190,7 @@ def vgg11_bn(**kwargs): Whether to load the pretrained weights for model. ctx : Context, default CPU The context in which to load the pretrained weights. - root : str, default '~/.mxnet/models' + root : str, default '$MXNET_HOME/models' Location for keeping the model parameters. """ kwargs['batch_norm'] = True @@ -206,7 +207,7 @@ def vgg13_bn(**kwargs): Whether to load the pretrained weights for model. ctx : Context, default CPU The context in which to load the pretrained weights. 
- root : str, default '~/.mxnet/models' + root : str, default '$MXNET_HOME/models' Location for keeping the model parameters. """ kwargs['batch_norm'] = True @@ -223,7 +224,7 @@ def vgg16_bn(**kwargs): Whether to load the pretrained weights for model. ctx : Context, default CPU The context in which to load the pretrained weights. - root : str, default '~/.mxnet/models' + root : str, default '$MXNET_HOME/models' Location for keeping the model parameters. """ kwargs['batch_norm'] = True @@ -240,7 +241,7 @@ def vgg19_bn(**kwargs): Whether to load the pretrained weights for model. ctx : Context, default CPU The context in which to load the pretrained weights. - root : str, default '~/.mxnet/models' + root : str, default '$MXNET_HOME/models' Location for keeping the model parameters. """ kwargs['batch_norm'] = True diff --git a/python/mxnet/util.py b/python/mxnet/util.py new file mode 100644 index 000000000..57bc2bf76 --- /dev/null +++ b/python/mxnet/util.py @@ -0,0 +1,30 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""general utility functions""" + +import os +import sys + + +def makedirs(d): + """Create directories recursively if they don't exist. os.makedirs(exist_ok=True) is not + available in Python2""" + if sys.version_info[0] < 3: + from distutils.dir_util import mkpath + mkpath(d) + else: + os.makedirs(d, exist_ok=True) diff --git a/scala-package/core/scripts/get_cifar_data.sh b/scala-package/core/scripts/get_cifar_data.sh index 9ec1c39a4..b061c1895 100755 --- a/scala-package/core/scripts/get_cifar_data.sh +++ b/scala-package/core/scripts/get_cifar_data.sh @@ -20,8 +20,8 @@ set -e -if [ ! -z "$MXNET_DATA_DIR" ]; then - data_path="$MXNET_DATA_DIR" +if [ ! -z "$MXNET_HOME" ]; then + data_path="$MXNET_HOME" else data_path="./data" fi diff --git a/scala-package/core/scripts/get_mnist_data.sh b/scala-package/core/scripts/get_mnist_data.sh index 97e151bf8..ded206fbb 100755 --- a/scala-package/core/scripts/get_mnist_data.sh +++ b/scala-package/core/scripts/get_mnist_data.sh @@ -20,8 +20,8 @@ set -e -if [ ! -z "$MXNET_DATA_DIR" ]; then - data_path="$MXNET_DATA_DIR" +if [ ! 
-z "$MXNET_HOME" ]; then + data_path="$MXNET_HOME" else data_path="./data" fi diff --git a/scala-package/core/src/test/scala/org/apache/mxnet/TestUtil.scala b/scala-package/core/src/test/scala/org/apache/mxnet/TestUtil.scala index 1187757a0..4fc8ec982 100644 --- a/scala-package/core/src/test/scala/org/apache/mxnet/TestUtil.scala +++ b/scala-package/core/src/test/scala/org/apache/mxnet/TestUtil.scala @@ -24,7 +24,7 @@ class TestUtil { * @return Data direcotry path ()may be relative) */ def getDataDirectory: String = { - var dataDir = System.getenv("MXNET_DATA_DIR") + var dataDir = System.getenv("MXNET_HOME") if(dataDir == null) { dataDir = "data" } else { diff --git a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/gan/GanMnist.scala b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/gan/GanMnist.scala index 6186989b7..70846eebf 100644 --- a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/gan/GanMnist.scala +++ b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/gan/GanMnist.scala @@ -181,7 +181,7 @@ object GanMnist { try { parser.parseArgument(args.toList.asJava) - val dataPath = if (anst.mnistDataPath == null) System.getenv("MXNET_DATA_DIR") + val dataPath = if (anst.mnistDataPath == null) System.getenv("MXNET_HOME") else anst.mnistDataPath assert(dataPath != null) diff --git a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/imclassification/TrainMnist.scala b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/imclassification/TrainMnist.scala index b0ecc7d29..bd0ce45ff 100644 --- a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/imclassification/TrainMnist.scala +++ b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/imclassification/TrainMnist.scala @@ -112,7 +112,7 @@ object TrainMnist { try { parser.parseArgument(args.toList.asJava) - val dataPath = if (inst.dataDir == null) System.getenv("MXNET_DATA_DIR") + val dataPath = if (inst.dataDir == null) System.getenv("MXNET_HOME") else inst.dataDir val (dataShape, net) = diff --git a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/infer/imageclassifier/ImageClassifierExample.scala b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/infer/imageclassifier/ImageClassifierExample.scala index e886b908b..3bbd780d3 100644 --- a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/infer/imageclassifier/ImageClassifierExample.scala +++ b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/infer/imageclassifier/ImageClassifierExample.scala @@ -119,13 +119,13 @@ object ImageClassifierExample { parser.parseArgument(args.toList.asJava) - val modelPathPrefix = if (inst.modelPathPrefix == null) System.getenv("MXNET_DATA_DIR") + val modelPathPrefix = if (inst.modelPathPrefix == null) System.getenv("MXNET_HOME") else inst.modelPathPrefix - val inputImagePath = if (inst.inputImagePath == null) System.getenv("MXNET_DATA_DIR") + val inputImagePath = if (inst.inputImagePath == null) System.getenv("MXNET_HOME") else inst.inputImagePath - val inputImageDir = if (inst.inputImageDir == null) System.getenv("MXNET_DATA_DIR") + val inputImageDir = if (inst.inputImageDir == null) System.getenv("MXNET_HOME") else inst.inputImageDir val singleOutput = runInferenceOnSingleImage(modelPathPrefix, inputImagePath, context) diff --git a/tests/python/unittest/test_base.py b/tests/python/unittest/test_base.py new file mode 100644 index 000000000..3189729e1 --- /dev/null +++ b/tests/python/unittest/test_base.py @@ -0,0 
+1,50 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import mxnet as mx +from mxnet.base import data_dir +from nose.tools import * +import os +import unittest +import logging +import os.path as op +import platform + +class MXNetDataDirTest(unittest.TestCase): + def setUp(self): + self.mxnet_data_dir = os.environ.get('MXNET_HOME') + if 'MXNET_HOME' in os.environ: + del os.environ['MXNET_HOME'] + + def tearDown(self): + if self.mxnet_data_dir: + os.environ['MXNET_HOME'] = self.mxnet_data_dir + else: + if 'MXNET_HOME' in os.environ: + del os.environ['MXNET_HOME'] + + def test_data_dir(self,): + prev_data_dir = data_dir() + system = platform.system() + if system != 'Windows': + self.assertEqual(data_dir(), op.join(op.expanduser('~'), '.mxnet')) + os.environ['MXNET_HOME'] = '/tmp/mxnet_data' + self.assertEqual(data_dir(), '/tmp/mxnet_data') + del os.environ['MXNET_HOME'] + self.assertEqual(data_dir(), prev_data_dir) + + From 6009b26232636c87ca35e8d3716064655417bd32 Mon Sep 17 00:00:00 2001 From: Lanking Date: Thu, 2 Aug 2018 09:55:47 -0700 Subject: [PATCH 47/63] [MXNET-748] linker fixed on Scala issues (#11989) * put force load back as a temporary solution * use project.basedir as relative path for OSX linker --- scala-package/native/osx-x86_64-cpu/pom.xml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scala-package/native/osx-x86_64-cpu/pom.xml b/scala-package/native/osx-x86_64-cpu/pom.xml index 3f66fe68e..e1c63104f 100644 --- a/scala-package/native/osx-x86_64-cpu/pom.xml +++ b/scala-package/native/osx-x86_64-cpu/pom.xml @@ -73,6 +73,8 @@ -Wl,-exported_symbol,_Java_* -Wl,-x ${lddeps} + -force_load ${project.basedir}/../../../lib/libmxnet.a + -force_load ${project.basedir}/../../../3rdparty/tvm/nnvm/lib/libnnvm.a ${ldflags} From 946e9d018452e8b9a8096a90d15c7015311d894f Mon Sep 17 00:00:00 2001 From: Lai Wei Date: Thu, 2 Aug 2018 10:09:43 -0700 Subject: [PATCH 48/63] [MXNET-772] Re-enable test_module.py:test_module_set_params (#11979) --- tests/python/unittest/test_module.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/python/unittest/test_module.py b/tests/python/unittest/test_module.py index 41ea5828d..a2a24762a 100644 --- a/tests/python/unittest/test_module.py +++ b/tests/python/unittest/test_module.py @@ -317,8 +317,9 @@ def create_bucketing_module(key): assert total_bytes_after == total_bytes_before - -@with_seed(11) +# roywei: Getting rid of fixed seed as flakiness could not be reproduced, +# tracked at: https://github.com/apache/incubator-mxnet/issues/11705 +@with_seed() def test_module_set_params(): # data iter data = mx.nd.array([[0.05, .10]]); From 1bd9356b30ecde412663d0020ed769042cf456d6 Mon Sep 17 00:00:00 2001 From: Lai Wei Date: Thu, 2 Aug 2018 10:11:17 -0700 Subject: [PATCH 49/63] [MXNET-771] Fix Flaky 
Test test_executor.py:test_dot (#11978) * use assert_almost_equal, increase rtol, reduce matrix size * remove seed in test_bind * add seed 0 to test_bind, it is still flaky * add comments for tracking --- tests/python/unittest/test_executor.py | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/tests/python/unittest/test_executor.py b/tests/python/unittest/test_executor.py index 630cad874..3117f6646 100644 --- a/tests/python/unittest/test_executor.py +++ b/tests/python/unittest/test_executor.py @@ -18,13 +18,7 @@ import numpy as np import mxnet as mx from common import setup_module, with_seed, teardown - - -def reldiff(a, b): - diff = np.sum(np.abs(a - b)) - norm = np.sum(np.abs(a)) - reldiff = diff / norm - return reldiff +from mxnet.test_utils import assert_almost_equal def check_bind_with_uniform(uf, gf, dim, sf=None, lshape=None, rshape=None): @@ -64,9 +58,9 @@ def check_bind_with_uniform(uf, gf, dim, sf=None, lshape=None, rshape=None): out1 = uf(lhs_arr.asnumpy(), rhs_arr.asnumpy()) out3 = exec3.outputs[0].asnumpy() out4 = exec4.outputs[0].asnumpy() - assert reldiff(out1, out2) < 1e-6 - assert reldiff(out1, out3) < 1e-6 - assert reldiff(out1, out4) < 1e-6 + assert_almost_equal(out1, out2, rtol=1e-5, atol=1e-5) + assert_almost_equal(out1, out3, rtol=1e-5, atol=1e-5) + assert_almost_equal(out1, out4, rtol=1e-5, atol=1e-5) # test gradient out_grad = mx.nd.array(np.ones(out2.shape)) lhs_grad2, rhs_grad2 = gf(out_grad.asnumpy(), @@ -74,8 +68,8 @@ def check_bind_with_uniform(uf, gf, dim, sf=None, lshape=None, rshape=None): rhs_arr.asnumpy()) executor.backward([out_grad]) - assert reldiff(lhs_grad.asnumpy(), lhs_grad2) < 1e-6 - assert reldiff(rhs_grad.asnumpy(), rhs_grad2) < 1e-6 + assert_almost_equal(lhs_grad.asnumpy(), lhs_grad2, rtol=1e-5, atol=1e-5) + assert_almost_equal(rhs_grad.asnumpy(), rhs_grad2, rtol=1e-5, atol=1e-5) @with_seed(0) @@ -118,12 +112,14 @@ def check_bind(disable_bulk_exec): check_bind(False) -@with_seed(0) +# @roywei: Removing fixed seed as flakiness in this test is fixed +# tracked at https://github.com/apache/incubator-mxnet/issues/11686 +@with_seed() def test_dot(): nrepeat = 10 maxdim = 4 for repeat in range(nrepeat): - s =tuple(np.random.randint(1, 500, size=3)) + s =tuple(np.random.randint(1, 200, size=3)) check_bind_with_uniform(lambda x, y: np.dot(x, y), lambda g, x, y: (np.dot(g, y.T), np.dot(x.T, g)), 2, @@ -131,7 +127,7 @@ def test_dot(): rshape=(s[1], s[2]), sf = mx.symbol.dot) for repeat in range(nrepeat): - s =tuple(np.random.randint(1, 500, size=1)) + s =tuple(np.random.randint(1, 200, size=1)) check_bind_with_uniform(lambda x, y: np.dot(x, y), lambda g, x, y: (g * y, g * x), 2, From 833de7e78118768c35358db7d2234f950fbd0637 Mon Sep 17 00:00:00 2001 From: jimdunn Date: Thu, 2 Aug 2018 14:38:19 -0700 Subject: [PATCH 50/63] remove mod from arity 2 version of load-checkpoint in clojure-package (#11808) * remove mod from arity 2 version of load-checkpoint * load-checkpoint arity 2 test --- .../src/org/apache/clojure_mxnet/module.clj | 4 +--- .../test/org/apache/clojure_mxnet/module_test.clj | 14 ++++++++++---- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/contrib/clojure-package/src/org/apache/clojure_mxnet/module.clj b/contrib/clojure-package/src/org/apache/clojure_mxnet/module.clj index 22ab76154..ab6d345fe 100644 --- a/contrib/clojure-package/src/org/apache/clojure_mxnet/module.clj +++ b/contrib/clojure-package/src/org/apache/clojure_mxnet/module.clj @@ -309,7 +309,6 @@ (defn load-checkpoint 
"Create a model from previously saved checkpoint. - - mod module - opts map of - prefix Path prefix of saved model files. You should have prefix-symbol.json, prefix-xxxx.params, and optionally prefix-xxxx.states, @@ -341,7 +340,7 @@ (util/->option (when workload-list (util/vec->indexed-seq workload-list))) (util/->option (when fixed-param-names (util/vec->set fixed-param-names))))) ([prefix epoch] - (load-checkpoint mod {:prefix prefix :epoch epoch}))) + (load-checkpoint {:prefix prefix :epoch epoch}))) (defn load-optimizer-states [mod fname] (.mod load fname)) @@ -670,4 +669,3 @@ (fit-params {:allow-missing true}) (fit-params {})) - diff --git a/contrib/clojure-package/test/org/apache/clojure_mxnet/module_test.clj b/contrib/clojure-package/test/org/apache/clojure_mxnet/module_test.clj index f3d4e75e8..0f71b5a85 100644 --- a/contrib/clojure-package/test/org/apache/clojure_mxnet/module_test.clj +++ b/contrib/clojure-package/test/org/apache/clojure_mxnet/module_test.clj @@ -101,13 +101,20 @@ (m/init-optimizer {:optimizer (optimizer/sgd {:learning-rate 0.1 :momentum 0.9})}) (m/update) (m/save-checkpoint {:prefix "test" :epoch 0 :save-opt-states true})) - (let [mod2 (m/load-checkpoint {:prefix "test" :epoch 0 :load-optimizer-states true})] (-> mod2 (m/bind {:data-shapes [{:name "data" :shape [10 10] :layout "NT"}]}) (m/init-optimizer {:optimizer (optimizer/sgd {:learning-rate 0.1 :momentum 0.9})})) - (is (= (-> mod m/symbol sym/to-json) (-> mod2 m/symbol sym/to-json))) - (is (= (-> mod m/params first) (-> mod2 m/params first)))))) + (is (= (-> mod m/symbol sym/to-json) (-> mod2 m/symbol sym/to-json))) + (is (= (-> mod m/params first) (-> mod2 m/params first)))) + ;; arity 2 version of above. `load-optimizer-states` is `false` here by default, + ;; but optimizers states aren't checked here so it's not relevant to the test outcome. + (let [mod3 (m/load-checkpoint "test" 0)] + (-> mod3 + (m/bind {:data-shapes [{:name "data" :shape [10 10] :layout "NT"}]}) + (m/init-optimizer {:optimizer (optimizer/sgd {:learning-rate 0.1 :momentum 0.9})})) + (is (= (-> mod m/symbol sym/to-json) (-> mod3 m/symbol sym/to-json))) + (is (= (-> mod m/params first) (-> mod3 m/params first)))))) (deftest test-module-save-load-multi-device (let [s (sym/variable "data") @@ -321,4 +328,3 @@ (comment (m/data-shapes x)) - From bcfab3a523baf8c4cbcf36514fe0fd2a580cbc30 Mon Sep 17 00:00:00 2001 From: Pedro Larroy <928489+larroy@users.noreply.github.com> Date: Fri, 3 Aug 2018 00:37:12 +0200 Subject: [PATCH 51/63] Add unit test stage for mxnet cpu in debug mode (#11974) --- Jenkinsfile | 27 +++++++++++++++++++++++++++ ci/docker/runtime_functions.sh | 19 +++++++++++++++++++ 2 files changed, 46 insertions(+) diff --git a/Jenkinsfile b/Jenkinsfile index 6d21f4964..6d9c60732 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -26,6 +26,8 @@ mx_lib = 'lib/libmxnet.so, lib/libmxnet.a, 3rdparty/dmlc-core/libdmlc.a, 3rdpart mx_dist_lib = 'lib/libmxnet.so, lib/libmxnet.a, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a, 3rdparty/ps-lite/build/libps.a, deps/lib/libprotobuf-lite.a, deps/lib/libzmq.a' // mxnet cmake libraries, in cmake builds we do not produce a libnvvm static library by default. mx_cmake_lib = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/dmlc-core/libdmlc.a, build/tests/mxnet_unit_tests, build/3rdparty/openmp/runtime/src/libomp.so' +// mxnet cmake libraries, in cmake builds we do not produce a libnvvm static library by default. 
+mx_cmake_lib_debug = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/dmlc-core/libdmlc.a, build/tests/mxnet_unit_tests' mx_cmake_mkldnn_lib = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/dmlc-core/libdmlc.a, build/tests/mxnet_unit_tests, build/3rdparty/openmp/runtime/src/libomp.so, build/3rdparty/mkldnn/src/libmkldnn.so.0' mx_mkldnn_lib = 'lib/libmxnet.so, lib/libmxnet.a, lib/libiomp5.so, lib/libmkldnn.so.0, lib/libmklml_intel.so, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a' // timeout in minutes @@ -233,6 +235,17 @@ try { } } }, + 'CPU: Openblas, debug': { + node('mxnetlinux-cpu') { + ws('workspace/build-cpu-openblas') { + timeout(time: max_time, unit: 'MINUTES') { + init_git() + docker_run('ubuntu_cpu', 'build_ubuntu_cpu_cmake_debug', false) + pack_lib('cpu_debug', mx_cmake_lib_debug) + } + } + } + }, 'CPU: Clang 3.9': { node('mxnetlinux-cpu') { ws('workspace/build-cpu-clang39') { @@ -574,6 +587,20 @@ try { } } }, + 'Python3: CPU debug': { + node('mxnetlinux-cpu') { + ws('workspace/ut-python3-cpu-debug') { + try { + init_git() + unpack_lib('cpu_debug', mx_cmake_lib_debug) + python3_ut('ubuntu_cpu') + } finally { + collect_test_results_unix('nosetests_unittest.xml', 'nosetests_python3_cpu_debug_unittest.xml') + collect_test_results_unix('nosetests_quantization.xml', 'nosetests_python3_cpu_debug_quantization.xml') + } + } + } + }, 'Python2: GPU': { node('mxnetlinux-gpu') { ws('workspace/ut-python2-gpu') { diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh index 52a2650a1..371d2cf63 100755 --- a/ci/docker/runtime_functions.sh +++ b/ci/docker/runtime_functions.sh @@ -345,6 +345,25 @@ build_ubuntu_cpu_openblas() { report_ccache_usage } +build_ubuntu_cpu_cmake_debug() { + set -ex + pushd . + cd /work/build + cmake \ + -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ + -DCMAKE_C_COMPILER_LAUNCHER=ccache \ + -DUSE_CUDA=OFF \ + -DUSE_MKL_IF_AVAILABLE=OFF \ + -DUSE_OPENMP=OFF \ + -DUSE_OPENCV=ON \ + -DCMAKE_BUILD_TYPE=Debug \ + -G Ninja \ + /work/mxnet + + ninja -v + popd +} + build_ubuntu_cpu_clang39() { set -ex From c9372776ace1eaeeb12aab69bb2151a30fcde93a Mon Sep 17 00:00:00 2001 From: Aaron Markham Date: Thu, 2 Aug 2018 18:23:09 -0700 Subject: [PATCH 52/63] Website broken link fixes (#12014) * fix broken link * fix broken link * switch to .md links * fix broken link --- docs/community/ecosystem.md | 2 +- docs/community/mxnet_channels.md | 2 +- docs/tutorials/scala/index.md | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/community/ecosystem.md b/docs/community/ecosystem.md index 5ca6d7a0b..54f8c8993 100644 --- a/docs/community/ecosystem.md +++ b/docs/community/ecosystem.md @@ -57,7 +57,7 @@ Community contributions to MXNet have added many new valuable features and funct ## Model Serving -* [MXNet Model Server (MMS)](https://github.com/apache/incubator-mxnet/tree/master/example/model-server/mms.md) - simple yet scalable solution for model inference. +* [MXNet Model Server (MMS)](https://github.com/awslabs/mxnet-model-server) - simple yet scalable solution for model inference. 
## Model Zoos diff --git a/docs/community/mxnet_channels.md b/docs/community/mxnet_channels.md index ef3963f7d..18dc1bc55 100644 --- a/docs/community/mxnet_channels.md +++ b/docs/community/mxnet_channels.md @@ -2,7 +2,7 @@ Converse with the MXNet community via the following channels: -- [Forum](https://discuss.mxnet.io/): [discuss.mxnet.io](discuss.mxnet.io) +- [Forum](https://discuss.mxnet.io/): [discuss.mxnet.io](https://discuss.mxnet.io/) - [MXNet Apache developer mailing list](https://lists.apache.org/list.html?dev@mxnet.apache.org) (dev@mxnet.apache.org): To subscribe, send an email to dev-subscribe@mxnet.apache.org - [MXNet Apache user mailing list](https://lists.apache.org/list.html?user@mxnet.apache.org) (user@mxnet.apache.org): To subscribe, send an email to user-subscribe@mxnet.apache.org - [MXNet Slack channel](https://apache-mxnet.slack.com): To request an invitation to the channel please subscribe to the mailing list above and then email: dev@mxnet.apache.org diff --git a/docs/tutorials/scala/index.md b/docs/tutorials/scala/index.md index cd9b2e219..f14337f90 100644 --- a/docs/tutorials/scala/index.md +++ b/docs/tutorials/scala/index.md @@ -6,8 +6,8 @@ Using MXNet-Scala is easiest with Maven. You have a couple of options for settin **Note:** Windows is not yet supported. -* [MXNet-Scala Setup Guide Using Maven](../install/scala_setup.html) -* [Setup Scala with MXNet and Create a MXNet-Scala Project with IntelliJ](mxnet_scala_on_intellij.html) +* [MXNet-Scala Setup Guide Using Maven](../../install/scala_setup.md) +* [Setup Scala with MXNet and Create a MXNet-Scala Project with IntelliJ](mxnet_scala_on_intellij.md) ## Tutorials From 1818280d33201ee369a41ab1be324e22b162c46f Mon Sep 17 00:00:00 2001 From: Ankit Khedia <36249596+ankkhedia@users.noreply.github.com> Date: Thu, 2 Aug 2018 20:47:50 -0700 Subject: [PATCH 53/63] removed seed from flaky test (#11975) --- tests/python/unittest/test_operator.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index 418951fcb..90e85d123 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -4963,8 +4963,9 @@ def _make_lower_triangle_symm(a, ndims, m, dtype=np.float32): lt_mask = mx.sym.reshape(lt_mask, shape=shp) return mx.sym.broadcast_mul(a, lt_mask) -# Seed set because the test is not robust enough to operate on random data -@with_seed(42) +# @ankkhedia: Getting rid of fixed seed as flakiness could not be reproduced +# tracked at https://github.com/apache/incubator-mxnet/issues/11718 +@with_seed() def test_laop(): dtype = np.float64 rtol_fw = 1e-7 From 619700a5ba0f482f4e48d985f0beb61e3e861655 Mon Sep 17 00:00:00 2001 From: Marco de Abreu Date: Fri, 3 Aug 2018 06:24:53 +0200 Subject: [PATCH 54/63] Disable ccache log print due to threadunsafety (#11997) --- ci/docker/runtime_functions.sh | 53 ++-------------------------------- 1 file changed, 3 insertions(+), 50 deletions(-) diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh index 371d2cf63..21471902c 100755 --- a/ci/docker/runtime_functions.sh +++ b/ci/docker/runtime_functions.sh @@ -110,23 +110,6 @@ build_jetson() { popd } -report_ccache_usage() { - set -ex - pushd . - - # Show global ccache summary at the end of each run. - ccache -s - if [ -e $CCACHE_LOGFILE ] - then - # Display local ccache log, excluding some overly verbose output. 
- cat $CCACHE_LOGFILE | grep -v "Config:" | grep -v "stats.lock" - else - echo "No ccache log found." - fi - - popd -} - # # ARM builds # @@ -159,7 +142,6 @@ build_armv6() { -G Ninja /work/mxnet ninja -v - report_ccache_usage build_wheel popd } @@ -191,7 +173,6 @@ build_armv7() { -G Ninja /work/mxnet ninja -v - report_ccache_usage build_wheel popd } @@ -210,7 +191,6 @@ build_armv8() { -DUSE_MKL_IF_AVAILABLE=OFF\ -G Ninja /work/mxnet ninja -v - report_ccache_usage build_wheel } @@ -237,7 +217,6 @@ build_android_armv7() { -DUSE_MKL_IF_AVAILABLE=OFF\ -G Ninja /work/mxnet ninja -v - report_ccache_usage } build_android_armv8() { @@ -270,8 +249,6 @@ build_centos7_cpu() { USE_BLAS=openblas \ USE_DIST_KVSTORE=1 \ -j$(nproc) - - report_ccache_usage } build_amzn_linux_cpu() { @@ -289,7 +266,6 @@ build_amzn_linux_cpu() { -DUSE_DIST_KVSTORE=ON\ -G Ninja /work/mxnet ninja -v - report_ccache_usage } @@ -306,8 +282,6 @@ build_centos7_mkldnn() { USE_MKLDNN=1 \ USE_BLAS=openblas \ -j$(nproc) - - report_ccache_usage } build_centos7_gpu() { @@ -341,8 +315,6 @@ build_ubuntu_cpu_openblas() { USE_BLAS=openblas \ USE_DIST_KVSTORE=1 \ -j$(nproc) - - report_ccache_usage } build_ubuntu_cpu_cmake_debug() { @@ -366,20 +338,15 @@ build_ubuntu_cpu_cmake_debug() { build_ubuntu_cpu_clang39() { set -ex - - export CXX=clang++-3.9 + export CXX=clang++-3.9 export CC=clang-3.9 - - build_ccache_wrappers - - make \ + build_ccache_wrappers + make \ USE_CPP_PACKAGE=1 \ USE_BLAS=openblas \ USE_OPENMP=0 \ USE_DIST_KVSTORE=1 \ -j$(nproc) - - report_ccache_usage } build_ubuntu_cpu_clang50() { @@ -396,8 +363,6 @@ build_ubuntu_cpu_clang50() { USE_OPENMP=1 \ USE_DIST_KVSTORE=1 \ -j$(nproc) - - report_ccache_usage } build_ubuntu_cpu_clang39_mkldnn() { @@ -414,8 +379,6 @@ build_ubuntu_cpu_clang39_mkldnn() { USE_MKLDNN=1 \ USE_OPENMP=0 \ -j$(nproc) - - report_ccache_usage } build_ubuntu_cpu_clang50_mkldnn() { @@ -432,8 +395,6 @@ build_ubuntu_cpu_clang50_mkldnn() { USE_MKLDNN=1 \ USE_OPENMP=1 \ -j$(nproc) - - report_ccache_usage } build_ubuntu_cpu_mkldnn() { @@ -447,8 +408,6 @@ build_ubuntu_cpu_mkldnn() { USE_BLAS=openblas \ USE_MKLDNN=1 \ -j$(nproc) - - report_ccache_usage } build_ubuntu_gpu() { @@ -469,8 +428,6 @@ build_ubuntu_gpu_mkldnn() { USE_CUDA_PATH=/usr/local/cuda \ USE_CUDNN=1 \ -j$(nproc) - - report_ccache_usage } build_ubuntu_gpu_mkldnn_nocudnn() { @@ -486,8 +443,6 @@ build_ubuntu_gpu_mkldnn_nocudnn() { USE_CUDA_PATH=/usr/local/cuda \ USE_CUDNN=0 \ -j$(nproc) - - report_ccache_usage } build_ubuntu_gpu_cuda91_cudnn7() { @@ -534,7 +489,6 @@ build_ubuntu_gpu_cmake_mkldnn() { /work/mxnet ninja -v - report_ccache_usage # libmkldnn.so.0 is a link file. We need an actual binary file named libmkldnn.so.0. 
cp 3rdparty/mkldnn/src/libmkldnn.so.0 3rdparty/mkldnn/src/libmkldnn.so.0.tmp mv 3rdparty/mkldnn/src/libmkldnn.so.0.tmp 3rdparty/mkldnn/src/libmkldnn.so.0 @@ -556,7 +510,6 @@ build_ubuntu_gpu_cmake() { /work/mxnet ninja -v - report_ccache_usage } From 25341648365598a9a123f033bf92ce7fb51c0a39 Mon Sep 17 00:00:00 2001 From: Piyush Ghai Date: Fri, 3 Aug 2018 03:08:05 -0700 Subject: [PATCH 55/63] Added default tolerance levels for regression checks for MBCC (#12006) * Added tolerance level for assert_almost_equal for MBCC * Nudge to CI --- .../nightly/model_backwards_compatibility_check/common.py | 2 ++ .../model_backwards_compat_inference.py | 8 ++++---- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/tests/nightly/model_backwards_compatibility_check/common.py b/tests/nightly/model_backwards_compatibility_check/common.py index 4c61cc4e3..8950a9270 100644 --- a/tests/nightly/model_backwards_compatibility_check/common.py +++ b/tests/nightly/model_backwards_compatibility_check/common.py @@ -41,6 +41,8 @@ backslash = '/' s3 = boto3.resource('s3') ctx = mx.cpu(0) +atol_default = 1e-5 +rtol_default = 1e-5 def get_model_path(model_name): diff --git a/tests/nightly/model_backwards_compatibility_check/model_backwards_compat_inference.py b/tests/nightly/model_backwards_compatibility_check/model_backwards_compat_inference.py index ae368e3a0..5d63e7e9b 100644 --- a/tests/nightly/model_backwards_compatibility_check/model_backwards_compat_inference.py +++ b/tests/nightly/model_backwards_compatibility_check/model_backwards_compat_inference.py @@ -44,7 +44,7 @@ def test_module_checkpoint_api(): old_inference_results = load_inference_results(model_name) inference_results = loaded_model.predict(data_iter) # Check whether they are equal or not ? - assert_almost_equal(inference_results.asnumpy(), old_inference_results.asnumpy()) + assert_almost_equal(inference_results.asnumpy(), old_inference_results.asnumpy(), rtol=rtol_default, atol=atol_default) clean_model_files(model_files, model_name) logging.info('=================================') @@ -69,7 +69,7 @@ def test_lenet_gluon_load_params_api(): loaded_model.load_params(model_name + '-params') output = loaded_model(test_data) old_inference_results = mx.nd.load(model_name + '-inference')['inference'] - assert_almost_equal(old_inference_results.asnumpy(), output.asnumpy()) + assert_almost_equal(old_inference_results.asnumpy(), output.asnumpy(), rtol=rtol_default, atol=atol_default) clean_model_files(model_files, model_name) logging.info('=================================') logging.info('Assertion passed for model : %s' % model_name) @@ -92,7 +92,7 @@ def test_lenet_gluon_hybrid_imports_api(): loaded_model = gluon.SymbolBlock.imports(model_name + '-symbol.json', ['data'], model_name + '-0000.params') output = loaded_model(test_data) old_inference_results = mx.nd.load(model_name + '-inference')['inference'] - assert_almost_equal(old_inference_results.asnumpy(), output.asnumpy()) + assert_almost_equal(old_inference_results.asnumpy(), output.asnumpy(), rtol=rtol_default, atol=atol_default) clean_model_files(model_files, model_name) logging.info('=================================') logging.info('Assertion passed for model : %s' % model_name) @@ -124,7 +124,7 @@ def test_lstm_gluon_load_parameters_api(): loaded_model.load_parameters(model_name + '-params') output = loaded_model(test_data) old_inference_results = mx.nd.load(model_name + '-inference')['inference'] - assert_almost_equal(old_inference_results.asnumpy(), output.asnumpy()) + 
assert_almost_equal(old_inference_results.asnumpy(), output.asnumpy(), rtol=rtol_default, atol=atol_default) clean_model_files(model_files, model_name) logging.info('=================================') logging.info('Assertion passed for model : %s' % model_name) From 32c2e159ae63458b9aa0231761c8fec38be42df4 Mon Sep 17 00:00:00 2001 From: Kellen Sunderland Date: Fri, 3 Aug 2018 13:02:15 +0200 Subject: [PATCH 56/63] Disable flaky mkldnn test_requantize_int32_to_int8 (#11748) --- tests/python/quantization/test_quantization.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/python/quantization/test_quantization.py b/tests/python/quantization/test_quantization.py index 359bbee56..08303c816 100644 --- a/tests/python/quantization/test_quantization.py +++ b/tests/python/quantization/test_quantization.py @@ -77,6 +77,7 @@ def test_dequantize_int8_to_float32(): @with_seed() +@unittest.skip('Flaky test, tracked in: https://github.com/apache/incubator-mxnet/issues/11747') def test_requantize_int32_to_int8(): def quantized_int32_to_float(qdata, min_range, max_range): assert qdata.dtype == 'int32' From 1fa04f2c9a7ba0c3273d080afa7fc993b927f114 Mon Sep 17 00:00:00 2001 From: Pedro Larroy <928489+larroy@users.noreply.github.com> Date: Fri, 3 Aug 2018 14:05:41 +0200 Subject: [PATCH 57/63] [MXNET-769] Usability improvements to windows builds (#11947) * Windows scripted build Adjust Jenkins builds to use ci/build_windows.py Issues: #8714 #11100 #10166 #10049 * Fix bug * Fix non-portable ut * add xunit --- CMakeLists.txt | 2 +- Jenkinsfile | 152 +++---------------- ci/build.py | 16 +- ci/build_windows.py | 253 +++++++++++++++++++++++++++++++ ci/util.py | 43 ++++++ ci/windows/test_py2_cpu.ps1 | 25 +++ ci/windows/test_py2_gpu.ps1 | 29 ++++ ci/windows/test_py3_cpu.ps1 | 25 +++ ci/windows/test_py3_gpu.ps1 | 29 ++++ tests/python/gpu/test_forward.py | 6 +- tests/requirements.txt | 3 + tools/license_header.py | 2 +- 12 files changed, 438 insertions(+), 147 deletions(-) create mode 100755 ci/build_windows.py create mode 100644 ci/util.py create mode 100644 ci/windows/test_py2_cpu.ps1 create mode 100644 ci/windows/test_py2_gpu.ps1 create mode 100644 ci/windows/test_py3_cpu.ps1 create mode 100644 ci/windows/test_py3_gpu.ps1 create mode 100644 tests/requirements.txt diff --git a/CMakeLists.txt b/CMakeLists.txt index 483108a68..000bbbf17 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -372,13 +372,13 @@ endif() # ---[ LAPack if(USE_LAPACK) + message("USE_LAPACK is ON") add_definitions(-DMXNET_USE_LAPACK=1) if (NOT MSVC) list(APPEND mxnet_LINKER_LIBS lapack) endif() endif() -message("USE LAPACK ${USE_LAPACK}") # ---[ jemalloc if(USE_JEMALLOC) diff --git a/Jenkinsfile b/Jenkinsfile index 6d9c60732..9d7792066 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -391,28 +391,8 @@ try { ws('workspace/build-cpu') { withEnv(['OpenBLAS_HOME=C:\\mxnet\\openblas', 'OpenCV_DIR=C:\\mxnet\\opencv_vc14', 'CUDA_PATH=C:\\CUDA\\v8.0']) { init_git_win() - bat """mkdir build_vc14_cpu - call "C:\\Program Files (x86)\\Microsoft Visual Studio 14.0\\VC\\bin\\x86_amd64\\vcvarsx86_amd64.bat" - cd build_vc14_cpu - cmake -G \"Visual Studio 14 2015 Win64\" -DUSE_CUDA=0 -DUSE_CUDNN=0 -DUSE_NVRTC=0 -DUSE_OPENCV=1 -DUSE_OPENMP=1 -DUSE_PROFILER=1 -DUSE_BLAS=open -DUSE_LAPACK=1 -DUSE_DIST_KVSTORE=0 -DUSE_MKL_IF_AVAILABLE=0 ${env.WORKSPACE}""" - bat 'C:\\mxnet\\build_vc14_cpu.bat' - - bat '''rmdir /s/q pkg_vc14_cpu - mkdir pkg_vc14_cpu\\lib - mkdir pkg_vc14_cpu\\python - mkdir pkg_vc14_cpu\\include - mkdir pkg_vc14_cpu\\build - copy 
build_vc14_cpu\\Release\\libmxnet.lib pkg_vc14_cpu\\lib - copy build_vc14_cpu\\Release\\libmxnet.dll pkg_vc14_cpu\\build - xcopy python pkg_vc14_cpu\\python /E /I /Y - xcopy include pkg_vc14_cpu\\include /E /I /Y - xcopy 3rdparty\\dmlc-core\\include pkg_vc14_cpu\\include /E /I /Y - xcopy 3rdparty\\mshadow\\mshadow pkg_vc14_cpu\\include\\mshadow /E /I /Y - xcopy 3rdparty\\nnvm\\include pkg_vc14_cpu\\nnvm\\include /E /I /Y - del /Q *.7z - 7z.exe a vc14_cpu.7z pkg_vc14_cpu\\ - ''' - stash includes: 'vc14_cpu.7z', name: 'vc14_cpu' + powershell 'python ci/build_windows.py -f WIN_CPU' + stash includes: 'windows_package.7z', name: 'windows_package_cpu' } } } @@ -424,28 +404,9 @@ try { timeout(time: max_time, unit: 'MINUTES') { ws('workspace/build-gpu') { withEnv(['OpenBLAS_HOME=C:\\mxnet\\openblas', 'OpenCV_DIR=C:\\mxnet\\opencv_vc14', 'CUDA_PATH=C:\\CUDA\\v8.0']) { - init_git_win() - bat """mkdir build_vc14_gpu - call "C:\\Program Files (x86)\\Microsoft Visual Studio 14.0\\VC\\bin\\x86_amd64\\vcvarsx86_amd64.bat" - cd build_vc14_gpu - cmake -G \"NMake Makefiles JOM\" -DUSE_CUDA=1 -DUSE_CUDNN=1 -DUSE_NVRTC=1 -DUSE_OPENCV=1 -DUSE_OPENMP=1 -DUSE_PROFILER=1 -DUSE_BLAS=open -DUSE_LAPACK=1 -DUSE_DIST_KVSTORE=0 -DCUDA_ARCH_NAME=Manual -DCUDA_ARCH_BIN=52 -DCUDA_ARCH_PTX=52 -DCMAKE_CXX_FLAGS_RELEASE="/FS /MD /O2 /Ob2 /DNDEBUG" -DCMAKE_BUILD_TYPE=Release -DUSE_MKL_IF_AVAILABLE=0 ${env.WORKSPACE}""" - bat 'C:\\mxnet\\build_vc14_gpu.bat' - bat '''rmdir /s/q pkg_vc14_gpu - mkdir pkg_vc14_gpu\\lib - mkdir pkg_vc14_gpu\\python - mkdir pkg_vc14_gpu\\include - mkdir pkg_vc14_gpu\\build - copy build_vc14_gpu\\libmxnet.lib pkg_vc14_gpu\\lib - copy build_vc14_gpu\\libmxnet.dll pkg_vc14_gpu\\build - xcopy python pkg_vc14_gpu\\python /E /I /Y - xcopy include pkg_vc14_gpu\\include /E /I /Y - xcopy 3rdparty\\dmlc-core\\include pkg_vc14_gpu\\include /E /I /Y - xcopy 3rdparty\\mshadow\\mshadow pkg_vc14_gpu\\include\\mshadow /E /I /Y - xcopy 3rdparty\\nnvm\\include pkg_vc14_gpu\\nnvm\\include /E /I /Y - del /Q *.7z - 7z.exe a vc14_gpu.7z pkg_vc14_gpu\\ - ''' - stash includes: 'vc14_gpu.7z', name: 'vc14_gpu' + init_git_win() + powershell 'python ci/build_windows.py -f WIN_GPU' + stash includes: 'windows_package.7z', name: 'windows_package_gpu' } } } @@ -456,37 +417,9 @@ try { timeout(time: max_time, unit: 'MINUTES') { ws('workspace/build-gpu') { withEnv(['OpenBLAS_HOME=C:\\mxnet\\openblas', 'OpenCV_DIR=C:\\mxnet\\opencv_vc14', 'CUDA_PATH=C:\\CUDA\\v8.0','BUILD_NAME=vc14_gpu_mkldnn']) { - init_git_win() - bat """mkdir build_%BUILD_NAME% - call "C:\\Program Files (x86)\\Microsoft Visual Studio 14.0\\VC\\bin\\x86_amd64\\vcvarsx86_amd64.bat" - cd build_%BUILD_NAME% - copy ${env.WORKSPACE}\\3rdparty\\mkldnn\\config_template.vcxproj.user ${env.WORKSPACE}\\config_template.vcxproj.user /y - cmake -G \"NMake Makefiles JOM\" -DUSE_CUDA=1 -DUSE_CUDNN=1 -DUSE_NVRTC=1 -DUSE_OPENCV=1 -DUSE_OPENMP=1 -DUSE_PROFILER=1 -DUSE_BLAS=open -DUSE_LAPACK=1 -DUSE_DIST_KVSTORE=0 -DCUDA_ARCH_NAME=Manual -DCUDA_ARCH_BIN=52 -DCUDA_ARCH_PTX=52 -DUSE_MKLDNN=1 -DCMAKE_CXX_FLAGS_RELEASE="/FS /MD /O2 /Ob2 /DNDEBUG" -DCMAKE_BUILD_TYPE=Release ${env.WORKSPACE}""" - bat ''' - call "C:\\Program Files (x86)\\Microsoft Visual Studio 14.0\\VC\\bin\\x86_amd64\\vcvarsx86_amd64.bat" - cd build_%BUILD_NAME% - set /a cores=%NUMBER_OF_PROCESSORS% * 2 - jom -j %cores% - ''' - bat '''rmdir /s/q pkg_%BUILD_NAME% - mkdir pkg_%BUILD_NAME%\\lib - mkdir pkg_%BUILD_NAME%\\python - mkdir pkg_%BUILD_NAME%\\include - mkdir pkg_%BUILD_NAME%\\build - copy 
build_%BUILD_NAME%\\libmxnet.lib pkg_%BUILD_NAME%\\lib - copy build_%BUILD_NAME%\\libmxnet.dll pkg_%BUILD_NAME%\\build - copy build_%BUILD_NAME%\\3rdparty\\mkldnn\\src\\mkldnn.dll pkg_%BUILD_NAME%\\build - copy build_%BUILD_NAME%\\libiomp5md.dll pkg_%BUILD_NAME%\\build - copy build_%BUILD_NAME%\\mklml.dll pkg_%BUILD_NAME%\\build - xcopy python pkg_%BUILD_NAME%\\python /E /I /Y - xcopy include pkg_%BUILD_NAME%\\include /E /I /Y - xcopy 3rdparty\\dmlc-core\\include pkg_%BUILD_NAME%\\include /E /I /Y - xcopy 3rdparty\\mshadow\\mshadow pkg_%BUILD_NAME%\\include\\mshadow /E /I /Y - xcopy 3rdparty\\nnvm\\include pkg_%BUILD_NAME%\\nnvm\\include /E /I /Y - del /Q *.7z - 7z.exe a %BUILD_NAME%.7z pkg_%BUILD_NAME%\\ - ''' - stash includes: 'vc14_gpu_mkldnn.7z', name: 'vc14_gpu_mkldnn' + init_git_win() + powershell 'python ci/build_windows.py -f WIN_GPU_MKLDNN' + stash includes: 'windows_package.7z', name: 'windows_package_gpu_mkldnn' } } } @@ -870,16 +803,8 @@ try { ws('workspace/ut-python-cpu') { try { init_git_win() - unstash 'vc14_cpu' - bat '''rmdir /s/q pkg_vc14_cpu - 7z x -y vc14_cpu.7z''' - bat """xcopy C:\\mxnet\\data data /E /I /Y - xcopy C:\\mxnet\\model model /E /I /Y - call activate py2 - pip install mock - set PYTHONPATH=${env.WORKSPACE}\\pkg_vc14_cpu\\python - del /S /Q ${env.WORKSPACE}\\pkg_vc14_cpu\\python\\*.pyc - C:\\mxnet\\test_cpu.bat""" + unstash 'windows_package_cpu' + powershell 'ci/windows/test_py2_cpu.ps1' } finally { collect_test_results_windows('nosetests_unittest.xml', 'nosetests_unittest_windows_python2_cpu.xml') } @@ -893,15 +818,8 @@ try { ws('workspace/ut-python-cpu') { try { init_git_win() - unstash 'vc14_cpu' - bat '''rmdir /s/q pkg_vc14_cpu - 7z x -y vc14_cpu.7z''' - bat """xcopy C:\\mxnet\\data data /E /I /Y - xcopy C:\\mxnet\\model model /E /I /Y - call activate py3 - set PYTHONPATH=${env.WORKSPACE}\\pkg_vc14_cpu\\python - del /S /Q ${env.WORKSPACE}\\pkg_vc14_cpu\\python\\*.pyc - C:\\mxnet\\test_cpu.bat""" + unstash 'windows_package_cpu' + powershell 'ci/windows/test_py3_cpu.ps1' } finally { collect_test_results_windows('nosetests_unittest.xml', 'nosetests_unittest_windows_python3_cpu.xml') } @@ -915,19 +833,11 @@ try { ws('workspace/ut-python-gpu') { try { init_git_win() - unstash 'vc14_gpu' - bat '''rmdir /s/q pkg_vc14_gpu - 7z x -y vc14_gpu.7z''' - bat """xcopy C:\\mxnet\\data data /E /I /Y - xcopy C:\\mxnet\\model model /E /I /Y - call activate py2 - pip install mock - set PYTHONPATH=${env.WORKSPACE}\\pkg_vc14_gpu\\python - del /S /Q ${env.WORKSPACE}\\pkg_vc14_gpu\\python\\*.pyc - C:\\mxnet\\test_gpu.bat""" + unstash 'windows_package_gpu' + powershell 'ci/windows/test_py2_gpu.ps1' } finally { - collect_test_results_windows('nosetests_gpu_forward.xml', 'nosetests_gpu_forward_windows_python2_gpu.xml') - collect_test_results_windows('nosetests_gpu_operator.xml', 'nosetests_gpu_operator_windows_python2_gpu.xml') + collect_test_results_windows('nosetests_forward.xml', 'nosetests_gpu_forward_windows_python2_gpu.xml') + collect_test_results_windows('nosetests_operator.xml', 'nosetests_gpu_operator_windows_python2_gpu.xml') } } } @@ -939,18 +849,11 @@ try { ws('workspace/ut-python-gpu') { try { init_git_win() - unstash 'vc14_gpu' - bat '''rmdir /s/q pkg_vc14_gpu - 7z x -y vc14_gpu.7z''' - bat """xcopy C:\\mxnet\\data data /E /I /Y - xcopy C:\\mxnet\\model model /E /I /Y - call activate py3 - set PYTHONPATH=${env.WORKSPACE}\\pkg_vc14_gpu\\python - del /S /Q ${env.WORKSPACE}\\pkg_vc14_gpu\\python\\*.pyc - C:\\mxnet\\test_gpu.bat""" + unstash 'windows_package_gpu' + 
powershell 'ci/windows/test_py3_gpu.ps1' } finally { - collect_test_results_windows('nosetests_gpu_forward.xml', 'nosetests_gpu_forward_windows_python3_gpu.xml') - collect_test_results_windows('nosetests_gpu_operator.xml', 'nosetests_gpu_operator_windows_python3_gpu.xml') + collect_test_results_windows('nosetests_forward.xml', 'nosetests_gpu_forward_windows_python3_gpu.xml') + collect_test_results_windows('nosetests_operator.xml', 'nosetests_gpu_operator_windows_python3_gpu.xml') } } } @@ -962,18 +865,11 @@ try { ws('workspace/ut-python-gpu') { try { init_git_win() - unstash 'vc14_gpu_mkldnn' - bat '''rmdir /s/q pkg_vc14_gpu_mkldnn - 7z x -y vc14_gpu_mkldnn.7z''' - bat """xcopy C:\\mxnet\\data data /E /I /Y - xcopy C:\\mxnet\\model model /E /I /Y - call activate py3 - set PYTHONPATH=${env.WORKSPACE}\\pkg_vc14_gpu_mkldnn\\python - del /S /Q ${env.WORKSPACE}\\pkg_vc14_gpu_mkldnn\\python\\*.pyc - C:\\mxnet\\test_gpu.bat""" + unstash 'windows_package_gpu_mkldnn' + powershell 'ci/windows/test_py3_gpu.ps1' } finally { - collect_test_results_windows('nosetests_gpu_forward.xml', 'nosetests_gpu_forward_windows_python3_gpu_mkldnn.xml') - collect_test_results_windows('nosetests_gpu_operator.xml', 'nosetests_gpu_operator_windows_python3_gpu_mkldnn.xml') + collect_test_results_windows('nosetests_forward.xml', 'nosetests_gpu_forward_windows_python3_gpu_mkldnn.xml') + collect_test_results_windows('nosetests_operator.xml', 'nosetests_gpu_operator_windows_python3_gpu_mkldnn.xml') } } } diff --git a/ci/build.py b/ci/build.py index 09f2d4709..a9d6a6353 100755 --- a/ci/build.py +++ b/ci/build.py @@ -39,6 +39,7 @@ from itertools import chain from subprocess import call, check_call from typing import * +from util import * CCACHE_MAXSIZE = '500G' @@ -138,24 +139,9 @@ def _get_local_image_id(docker_binary, docker_tag): return image_id -def get_mxnet_root() -> str: - curpath = os.path.abspath(os.path.dirname(__file__)) - - def is_mxnet_root(path: str) -> bool: - return os.path.exists(os.path.join(path, ".mxnet_root")) - - while not is_mxnet_root(curpath): - parent = os.path.abspath(os.path.join(curpath, os.pardir)) - if parent == curpath: - raise RuntimeError("Got to the root and couldn't find a parent folder with .mxnet_root") - curpath = parent - return curpath - - def buildir() -> str: return os.path.join(get_mxnet_root(), "build") - def default_ccache_dir() -> str: # Share ccache across containers if 'CCACHE_DIR' in os.environ: diff --git a/ci/build_windows.py b/ci/build_windows.py new file mode 100755 index 000000000..5eca58db7 --- /dev/null +++ b/ci/build_windows.py @@ -0,0 +1,253 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +"""User friendly / multi platform builder script""" + +import subprocess +import logging +import os +import tempfile +import sys +from distutils import spawn +import logging +from subprocess import check_call +import platform +import argparse +from util import * +import json +from enum import Enum +import time +import datetime +import shutil +import glob +from distutils.dir_util import copy_tree + +KNOWN_VCVARS = [ + # VS 2015 + r'C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\bin\x86_amd64\vcvarsx86_amd64.bat' + # VS 2017 + , r'c:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary\Build\vcvarsx86_amd64.bat' +] + +class BuildFlavour(Enum): + WIN_CPU = 'WIN_CPU' + WIN_CPU_MKLDNN = 'WIN_CPU_MKLDNN' + WIN_GPU = 'WIN_GPU' + WIN_GPU_MKLDNN = 'WIN_GPU_MKLDNN' + +CMAKE_FLAGS = { + 'WIN_CPU': '-DUSE_CUDA=0 \ + -DUSE_CUDNN=0 \ + -DUSE_NVRTC=0 \ + -DUSE_OPENCV=1 \ + -DUSE_OPENMP=1 \ + -DUSE_PROFILER=1 \ + -DUSE_BLAS=open \ + -DUSE_LAPACK=1 \ + -DUSE_DIST_KVSTORE=0 \ + -DBUILD_CPP_EXAMPLES=1 \ + -DUSE_MKL_IF_AVAILABLE=0' + + ,'WIN_CPU_MKLDNN': '-DUSE_CUDA=0 \ + -DUSE_CUDNN=0 \ + -DUSE_NVRTC=0 \ + -DUSE_OPENCV=1 \ + -DUSE_OPENMP=1 \ + -DUSE_PROFILER=1 \ + -DUSE_BLAS=open \ + -DUSE_LAPACK=1 \ + -DUSE_DIST_KVSTORE=0 \ + -DUSE_MKL_IF_AVAILABLE=1' + + ,'WIN_GPU': '-DUSE_CUDA=1 \ + -DUSE_CUDNN=1 \ + -DUSE_NVRTC=1 \ + -DUSE_OPENCV=1 \ + -DUSE_OPENMP=1 \ + -DUSE_PROFILER=1 \ + -DUSE_BLAS=open \ + -DUSE_LAPACK=1 \ + -DUSE_DIST_KVSTORE=0 \ + -DCUDA_ARCH_NAME=Manual \ + -DCUDA_ARCH_BIN=52 \ + -DCUDA_ARCH_PTX=52 \ + -DCMAKE_CXX_FLAGS_RELEASE="/FS /MD /O2 /Ob2 /DNDEBUG" \ + -DUSE_MKL_IF_AVAILABLE=0 \ + -DCMAKE_BUILD_TYPE=Release' + + ,'WIN_GPU_MKLDNN': '-DUSE_CUDA=1 \ + -DUSE_CUDNN=1 \ + -DUSE_NVRTC=1 \ + -DUSE_OPENCV=1 \ + -DUSE_OPENMP=1 \ + -DUSE_PROFILER=1 \ + -DUSE_BLAS=open \ + -DUSE_LAPACK=1 \ + -DUSE_DIST_KVSTORE=0 \ + -DCUDA_ARCH_NAME=Manual \ + -DCUDA_ARCH_BIN=52 \ + -DCUDA_ARCH_PTX=52 \ + -DUSE_MKLDNN=1 \ + -DCMAKE_CXX_FLAGS_RELEASE="/FS /MD /O2 /Ob2 \ + /DNDEBUG" \ + -DCMAKE_BUILD_TYPE=Release' + +} + + +def get_vcvars_environment(architecture, vcvars): + """ + Returns a dictionary containing the environment variables set up by vcvars + """ + result = None + python = sys.executable + + vcvars_list = [vcvars] + vcvars_list.extend(KNOWN_VCVARS) + for vcvars in vcvars_list: + if os.path.isfile(vcvars): + process = subprocess.Popen('("%s" %s>nul) && "%s" -c "import os; import json; print(json.dumps(dict(os.environ)))"' % (vcvars, architecture, python), stdout=subprocess.PIPE, shell=True) + stdout, stderr = process.communicate() + exitcode = process.wait() + if exitcode == 0: + logging.info("Using build environment from: %s", vcvars) + return(json.loads(stdout.strip())) + else: + raise RuntimeError('Failed cloning environment from vcvars file: %s stdout: %s stderr: %s', vcvars, stdout, stderr) + raise RuntimeError('Couldn\'t find vcvars batch file: %s', vcvars) + + +def windows_build(args): + vcvars_env = get_vcvars_environment(args.arch, args.vcvars) + logging.debug("vcvars environment: %s", vcvars_env) + os.environ.update(vcvars_env) + + path = args.output + os.makedirs(path, exist_ok=True) + mxnet_root = get_mxnet_root() + logging.info("Found mxnet root: {}".format(mxnet_root)) + with remember_cwd(): + os.chdir(path) + logging.info("Generating project with CMake") + check_call("cmake -G \"Visual Studio 14 2015 Win64\" {} {}".format(CMAKE_FLAGS[args.flavour], mxnet_root), shell=True) + logging.info("Building with visual studio") + t0 = int(time.time()) + 
check_call(["msbuild", "mxnet.sln","/p:configuration=release;platform=x64", "/maxcpucount","/v:minimal"]) + logging.info("Build flavour: %s complete in directory: \"%s\"", args.flavour, os.path.abspath(path)) + logging.info("Build took %s" , datetime.timedelta(seconds=int(time.time()-t0))) + windows_package(args) + +def windows_package(args): + pkgfile = 'windows_package.7z' + pkgdir = os.path.abspath('windows_package') + logging.info("Packaging libraries and headers in package: %s", pkgfile) + j = os.path.join + pkgdir_lib = os.path.abspath(j(pkgdir, 'lib')) + with remember_cwd(): + os.chdir(args.output) + logging.info("Looking for static libraries and dlls in: \"%s", os.getcwd()) + libs = list(glob.iglob('**/*.lib', recursive=True)) + dlls = list(glob.iglob('**/*.dll', recursive=True)) + os.makedirs(pkgdir_lib, exist_ok=True) + for lib in libs: + logging.info("packing lib: %s", lib) + shutil.copy(lib, pkgdir_lib) + for dll in dlls: + logging.info("packing dll: %s", dll) + shutil.copy(dll, pkgdir_lib) + os.chdir(get_mxnet_root()) + logging.info('packing python bindings') + copy_tree('python', j(pkgdir, 'python')) + logging.info('packing headers') + copy_tree('include', j(pkgdir, 'include')) + copy_tree(j('3rdparty','dmlc-core','include'), j(pkgdir, 'include')) + copy_tree(j('3rdparty','mshadow', 'mshadow'), j(pkgdir, 'include', 'mshadow')) + copy_tree(j('3rdparty','tvm','nnvm', 'include'), j(pkgdir,'include', 'nnvm', 'include')) + logging.info("Compressing package: %s", pkgfile) + check_call(['7z', 'a', pkgfile, pkgdir]) + + +def nix_build(args): + path = args.output + os.makedirs(path, exist_ok=True) + with remember_cwd(): + os.chdir(path) + logging.info("Generating project with CMake") + check_call("cmake \ + -DUSE_CUDA=OFF \ + -DUSE_OPENCV=OFF \ + -DUSE_OPENMP=OFF \ + -DCMAKE_BUILD_TYPE=Debug \ + -GNinja ..", shell=True) + check_call("ninja", shell=True) + +def main(): + logging.getLogger().setLevel(logging.INFO) + logging.basicConfig(format='%(asctime)-15s %(message)s') + logging.info("MXNet Windows build helper") + + parser = argparse.ArgumentParser() + parser.add_argument("-o", "--output", + help="output directory", + default='build', + type=str) + + parser.add_argument("--vcvars", + help="vcvars batch file location, typically inside vs studio install dir", + default=r'c:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary\Build\vcvarsx86_amd64.bat', + type=str) + + parser.add_argument("--arch", + help="architecture", + default='x64', + type=str) + + parser.add_argument("-f", "--flavour", + help="build flavour", + default='WIN_CPU', + choices=[x.name for x in BuildFlavour], + type=str) + + args = parser.parse_args() + logging.info("Build flavour: %s", args.flavour) + + system = platform.system() + if system == 'Windows': + logging.info("Detected Windows platform") + if 'OpenBLAS_HOME' not in os.environ: + os.environ["OpenBLAS_HOME"] = "C:\\mxnet\\openblas" + if 'OpenCV_DIR' not in os.environ: + os.environ["OpenCV_DIR"] = "C:\\mxnet\\opencv_vc14" + if 'CUDA_PATH' not in os.environ: + os.environ["CUDA_PATH"] = "C:\\CUDA\\v8.0" + windows_build(args) + + elif system == 'Linux' or system == 'Darwin': + nix_build(args) + + else: + logging.error("Don't know how to build for {} yet".format(platform.system())) + + return 0 + + +if __name__ == '__main__': + sys.exit(main()) + diff --git a/ci/util.py b/ci/util.py new file mode 100644 index 000000000..22631f304 --- /dev/null +++ b/ci/util.py @@ -0,0 +1,43 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or 
more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import os +import contextlib + +def get_mxnet_root() -> str: + curpath = os.path.abspath(os.path.dirname(__file__)) + + def is_mxnet_root(path: str) -> bool: + return os.path.exists(os.path.join(path, ".mxnet_root")) + + while not is_mxnet_root(curpath): + parent = os.path.abspath(os.path.join(curpath, os.pardir)) + if parent == curpath: + raise RuntimeError("Got to the root and couldn't find a parent folder with .mxnet_root") + curpath = parent + return curpath + +@contextlib.contextmanager +def remember_cwd(): + ''' + Restore current directory when exiting context + ''' + curdir = os.getcwd() + try: yield + finally: os.chdir(curdir) + + diff --git a/ci/windows/test_py2_cpu.ps1 b/ci/windows/test_py2_cpu.ps1 new file mode 100644 index 000000000..1623d2956 --- /dev/null +++ b/ci/windows/test_py2_cpu.ps1 @@ -0,0 +1,25 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +7z x -y windows_package.7z +$env:PYTHONPATH=join-path $pwd.Path windows_package\python +$env:MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0 +c:\Anaconda3\envs\py2\Scripts\pip install -r tests\requirements.txt +c:\Anaconda3\envs\py2\python.exe -m nose -v --with-xunit --xunit-file nosetests_unittest.xml tests\python\unittest +if (! $?) { Throw ("Error running unittest") } +c:\Anaconda3\envs\py2\python.exe -m nose -v --with-xunit --xunit-file nosetests_train.xml tests\python\train +if (! $?) { Throw ("Error running train tests") } diff --git a/ci/windows/test_py2_gpu.ps1 b/ci/windows/test_py2_gpu.ps1 new file mode 100644 index 000000000..13cd5366e --- /dev/null +++ b/ci/windows/test_py2_gpu.ps1 @@ -0,0 +1,29 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +7z x -y windows_package.7z +$env:PYTHONPATH=join-path $pwd.Path windows_package\python +$env:MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0 +c:\Anaconda3\envs\py2\Scripts\pip install -r tests\requirements.txt +c:\Anaconda3\envs\py2\python.exe -m nose -v --with-xunit --xunit-file nosetests_unittest.xml tests\python\unittest +if (! $?) { Throw ("Error running unittest") } +c:\Anaconda3\envs\py2\python.exe -m nose -v --with-xunit --xunit-file nosetests_operator.xml tests\python\gpu\test_operator_gpu.py +if (! $?) { Throw ("Error running tests") } +c:\Anaconda3\envs\py2\python.exe -m nose -v --with-xunit --xunit-file nosetests_forward.xml tests\python\gpu\test_forward.py +if (! $?) { Throw ("Error running tests") } +c:\Anaconda3\envs\py2\python.exe -m nose -v tests\python\train +if (! $?) { Throw ("Error running tests") } diff --git a/ci/windows/test_py3_cpu.ps1 b/ci/windows/test_py3_cpu.ps1 new file mode 100644 index 000000000..98d4e410e --- /dev/null +++ b/ci/windows/test_py3_cpu.ps1 @@ -0,0 +1,25 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +7z x -y windows_package.7z +$env:PYTHONPATH=join-path $pwd.Path windows_package\python +$env:MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0 +c:\Anaconda3\envs\py3\Scripts\pip install -r tests\requirements.txt +c:\Anaconda3\envs\py3\python.exe -m nose -v --with-xunit --xunit-file nosetests_unittest.xml tests\python\unittest +if (! $?) { Throw ("Error running unittest") } +c:\Anaconda3\envs\py3\python.exe -m nose -v --with-xunit --xunit-file nosetests_train.xml tests\python\train +if (! $?) { Throw ("Error running train tests") } diff --git a/ci/windows/test_py3_gpu.ps1 b/ci/windows/test_py3_gpu.ps1 new file mode 100644 index 000000000..b94b4f389 --- /dev/null +++ b/ci/windows/test_py3_gpu.ps1 @@ -0,0 +1,29 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +7z x -y windows_package.7z +$env:PYTHONPATH=join-path $pwd.Path windows_package\python +$env:MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0 +c:\Anaconda3\envs\py3\Scripts\pip install -r tests\requirements.txt +c:\Anaconda3\envs\py3\python.exe -m nose -v --with-xunit --xunit-file nosetests_unittest.xml tests\python\unittest +if (! $?) { Throw ("Error running unittest") } +c:\Anaconda3\envs\py3\python.exe -m nose -v --with-xunit --xunit-file nosetests_operator.xml tests\python\gpu\test_operator_gpu.py +if (! $?) { Throw ("Error running tests") } +c:\Anaconda3\envs\py3\python.exe -m nose -v --with-xunit --xunit-file nosetests_forward.xml tests\python\gpu\test_forward.py +if (! $?) { Throw ("Error running tests") } +c:\Anaconda3\envs\py3\python.exe -m nose -v --with-xunit --xunit-file nosetests_train.xml tests\python\train +if (! $?) { Throw ("Error running tests") } diff --git a/tests/python/gpu/test_forward.py b/tests/python/gpu/test_forward.py index 126ccabaa..02b025602 100644 --- a/tests/python/gpu/test_forward.py +++ b/tests/python/gpu/test_forward.py @@ -24,11 +24,13 @@ sys.path.insert(0, os.path.join(curr_path, '../unittest')) from common import setup_module, with_seed, teardown from mxnet.gluon import utils +import tarfile def _get_model(): if not os.path.exists('model/Inception-7-symbol.json'): - download('http://data.mxnet.io/models/imagenet/inception-v3.tar.gz', dirname='model') - os.system("cd model; tar -xf inception-v3.tar.gz --strip-components 1") + download('http://data.mxnet.io/models/imagenet/inception-v3.tar.gz') + with tarfile.open(name="inception-v3.tar.gz", mode="r:gz") as tf: + tf.extractall() def _dump_images(shape): import skimage.io diff --git a/tests/requirements.txt b/tests/requirements.txt new file mode 100644 index 000000000..0eca73fbb --- /dev/null +++ b/tests/requirements.txt @@ -0,0 +1,3 @@ +# Requirements for tests, those are installed before running on the virtualenv +mock +nose diff --git a/tools/license_header.py b/tools/license_header.py index 0ee404933..7aef33b71 100755 --- a/tools/license_header.py +++ b/tools/license_header.py @@ -82,7 +82,7 @@ _LANGS = {'.cc':'*', '.h':'*', '.cu':'*', '.cuh':'*', '.py':'#', '.pm':'#', '.scala':'*', '.cc':'*', '.sh':'#', '.cmake':'#', '.java':'*', '.sh':'#', '.cpp':'*', '.hpp':'*', '.c':'*', - '.bat':'rem', '.pl':'#', '.m':'%', '.R':'#', '.mk':'#', '.cfg':'#', '.t':'#'} + '.bat':'rem', '.pl':'#', '.m':'%', '.R':'#', '.mk':'#', '.cfg':'#', '.t':'#', '.ps1': '#'} # Previous license header, which will be removed _OLD_LICENSE = re.compile('.*Copyright.*by Contributors') From 56281946cc0d1185235893c61c174cd6afea28a3 Mon Sep 17 00:00:00 2001 From: Vandana Kannan Date: Fri, 3 Aug 2018 11:52:38 -0700 Subject: [PATCH 58/63] Fix import statement (#12005) array and multiply are undefined. 
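A minimal sketch of the failure this fixes, assuming a call site in optimizer.py that uses both names (hypothetical snippet, not part of the patch):

    from mxnet.ndarray import array, multiply  # the two names the patch adds to the import list
    a = array([1.0, 2.0])
    print(multiply(a, 2.0).asnumpy())  # without the module-level import, this raises NameError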
Importing them from ndarray --- python/mxnet/optimizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/mxnet/optimizer.py b/python/mxnet/optimizer.py index f758af5f9..7e69cf613 100644 --- a/python/mxnet/optimizer.py +++ b/python/mxnet/optimizer.py @@ -24,7 +24,7 @@ import warnings import numpy from .base import py_str -from .ndarray import (NDArray, zeros, clip, sqrt, cast, maximum, abs as NDabs) +from .ndarray import (NDArray, zeros, clip, sqrt, cast, maximum, abs as NDabs, array, multiply) from .ndarray import (sgd_update, sgd_mom_update, adam_update, rmsprop_update, rmspropalex_update, mp_sgd_update, mp_sgd_mom_update, square, ftrl_update, ftml_update, signsgd_update, signum_update) From 3dd0003c2b9d4d561c5346372caea1db1aa37744 Mon Sep 17 00:00:00 2001 From: Marco de Abreu Date: Fri, 3 Aug 2018 23:40:23 +0200 Subject: [PATCH 59/63] Disable flaky test test_random.test_gamma_generator (#12022) --- tests/python/unittest/test_random.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/python/unittest/test_random.py b/tests/python/unittest/test_random.py index d90dfcf85..43e960893 100644 --- a/tests/python/unittest/test_random.py +++ b/tests/python/unittest/test_random.py @@ -447,6 +447,7 @@ def test_uniform_generator(): verify_generator(generator=generator_mx_same_seed, buckets=buckets, probs=probs) @with_seed() +@unittest.skip('Flaky test, tracked in: https://github.com/apache/incubator-mxnet/issues/9856') def test_gamma_generator(): ctx = mx.context.current_context() for dtype in ['float16', 'float32', 'float64']: From 3910c08d4e054ae55d59cd6335f9448819550116 Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Fri, 3 Aug 2018 15:42:18 -0700 Subject: [PATCH 60/63] [MXNET-770] Fix flaky test: test_factorization_machine_module (#12023) * Remove fixed seed in flaky test * Remove fixed seed in flaky test * Update random seed to reproduce the issue * Fix Flaky unit test and add a training test * Remove fixed seed in flaky test * Update random seed to reproduce the issue * Fix Flaky unit test and add a training test * Increase accuracy check --- tests/python/train/test_sparse_fm.py | 138 +++++++++++++++++++++++++++ tests/python/unittest/test_module.py | 64 ++++--------- 2 files changed, 154 insertions(+), 48 deletions(-) create mode 100644 tests/python/train/test_sparse_fm.py diff --git a/tests/python/train/test_sparse_fm.py b/tests/python/train/test_sparse_fm.py new file mode 100644 index 000000000..99a22f54c --- /dev/null +++ b/tests/python/train/test_sparse_fm.py @@ -0,0 +1,138 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +import mxnet as mx +import mxnet.ndarray as nd +from mxnet.test_utils import * +import numpy as np + +def test_factorization_machine_module(verbose=False): + """ Test factorization machine model with sparse operators """ + def check_factorization_machine_module(optimizer=None, num_epochs=None): + print("check_factorization_machine_module( {} )".format(optimizer)) + + def fm(factor_size, feature_dim, init): + x = mx.symbol.Variable("data", stype='csr') + v = mx.symbol.Variable("v", shape=(feature_dim, factor_size), + init=init, stype='row_sparse') + + w1_weight = mx.symbol.var('w1_weight', shape=(feature_dim, 1), + init=init, stype='row_sparse') + w1_bias = mx.symbol.var('w1_bias', shape=(1)) + w1 = mx.symbol.broadcast_add(mx.symbol.dot(x, w1_weight), w1_bias) + + v_s = mx.symbol._internal._square_sum(data=v, axis=1, keepdims=True) + x_s = mx.symbol.square(data=x) + bd_sum = mx.sym.dot(x_s, v_s) + + w2 = mx.symbol.dot(x, v) + w2_squared = 0.5 * mx.symbol.square(data=w2) + + w_all = mx.symbol.Concat(w1, w2_squared, dim=1) + sum1 = mx.symbol.sum(data=w_all, axis=1, keepdims=True) + sum2 = 0.5 * mx.symbol.negative(bd_sum) + model = mx.sym.elemwise_add(sum1, sum2) + + y = mx.symbol.Variable("label") + model = mx.symbol.LinearRegressionOutput(data=model, label=y) + return model + + # model + init = mx.initializer.Normal(sigma=0.01) + factor_size = 4 + feature_dim = 10000 + model = fm(factor_size, feature_dim, init) + + # data iter + num_batches = 5 + batch_size = 64 + num_samples = batch_size * num_batches + # generate some random csr data + csr_nd = rand_ndarray((num_samples, feature_dim), 'csr', 0.1) + label = mx.nd.ones((num_samples,1)) + # the alternative is to use LibSVMIter + train_iter = mx.io.NDArrayIter(data=csr_nd, + label={'label':label}, + batch_size=batch_size, + last_batch_handle='discard') + # create module + mod = mx.mod.Module(symbol=model, data_names=['data'], label_names=['label']) + # allocate memory by given the input data and lable shapes + mod.bind(data_shapes=train_iter.provide_data, label_shapes=train_iter.provide_label) + # initialize parameters by uniform random numbers + mod.init_params(initializer=init) + if optimizer == 'sgd': + # use Sparse SGD with learning rate 0.1 to train + sgd = mx.optimizer.SGD(momentum=0.1, clip_gradient=5.0, learning_rate=0.01, + rescale_grad=1.0/batch_size) + mod.init_optimizer(optimizer=sgd) + if num_epochs is None: + num_epochs = 10 + expected_accuracy = 0.02 + elif optimizer == 'adam': + # use Sparse Adam to train + adam = mx.optimizer.Adam(clip_gradient=5.0, learning_rate=0.0005, + rescale_grad=1.0/batch_size) + mod.init_optimizer(optimizer=adam) + if num_epochs is None: + num_epochs = 10 + expected_accuracy = 0.05 + elif optimizer == 'adagrad': + # use Sparse AdaGrad with learning rate 0.1 to train + adagrad = mx.optimizer.AdaGrad(clip_gradient=5.0, learning_rate=0.01, + rescale_grad=1.0/batch_size) + mod.init_optimizer(optimizer=adagrad) + if num_epochs is None: + num_epochs = 20 + expected_accuracy = 0.09 + else: + raise AssertionError("Unsupported optimizer type '" + optimizer + "' specified") + # use accuracy as the metric + metric = mx.metric.create('MSE') + # train 'num_epochs' epoch + for epoch in range(num_epochs): + train_iter.reset() + metric.reset() + for batch in train_iter: + mod.forward(batch, is_train=True) # compute predictions + mod.update_metric(metric, batch.label) # accumulate prediction accuracy + mod.backward() # compute gradients + mod.update() # update parameters + print('Epoch %d, Training %s' % (epoch, 
metric.get())) + if num_epochs > 1: + assert(metric.get()[1] < expected_accuracy) + + if verbose is True: + print("============ SGD ==========================") + start = time.clock() + check_factorization_machine_module('sgd') + if verbose is True: + print("Duration: {}".format(time.clock() - start)) + print("============ ADAM ==========================") + start = time.clock() + check_factorization_machine_module('adam') + if verbose is True: + print("Duration: {}".format(time.clock() - start)) + print("============ ADAGRAD ==========================") + start = time.clock() + check_factorization_machine_module('adagrad') + if verbose is True: + print("Duration: {}".format(time.clock() - start)) + +# run as a script +if __name__ == "__main__": + test_factorization_machine_module() diff --git a/tests/python/unittest/test_module.py b/tests/python/unittest/test_module.py index a2a24762a..a21527a5a 100644 --- a/tests/python/unittest/test_module.py +++ b/tests/python/unittest/test_module.py @@ -558,11 +558,12 @@ def check_shared_exec_group(sparse_embedding): for opt in sparse_embedding_opt: check_shared_exec_group(opt) -@with_seed(11) -def test_factorization_machine_module(verbose=False): +@with_seed() +def test_factorization_machine_module(): """ Test factorization machine model with sparse operators """ - def check_factorization_machine_module(optimizer=None, num_epochs=None): - print("check_factorization_machine_module( {} )".format(optimizer)) + # this unit test is to test the flow, training accuracy is tested in another test + def check_factorization_machine_module(num_epochs=None): + print("check_factorization_machine_module") def fm(factor_size, feature_dim, init): x = mx.symbol.Variable("data", stype='csr') @@ -614,33 +615,16 @@ def fm(factor_size, feature_dim, init): mod.bind(data_shapes=train_iter.provide_data, label_shapes=train_iter.provide_label) # initialize parameters by uniform random numbers mod.init_params(initializer=init) - if optimizer == 'sgd': - # use Sparse SGD with learning rate 0.1 to train - sgd = mx.optimizer.SGD(momentum=0.1, clip_gradient=5.0, learning_rate=0.01, - rescale_grad=1.0/batch_size) - mod.init_optimizer(optimizer=sgd) - if num_epochs is None: - num_epochs = 10 - expected_accuracy = 0.02 - elif optimizer == 'adam': - # use Sparse Adam to train - adam = mx.optimizer.Adam(clip_gradient=5.0, learning_rate=0.0005, - rescale_grad=1.0/batch_size) - mod.init_optimizer(optimizer=adam) - if num_epochs is None: - num_epochs = 10 - expected_accuracy = 0.05 - elif optimizer == 'adagrad': - # use Sparse AdaGrad with learning rate 0.1 to train - adagrad = mx.optimizer.AdaGrad(clip_gradient=5.0, learning_rate=0.01, - rescale_grad=1.0/batch_size) - mod.init_optimizer(optimizer=adagrad) - if num_epochs is None: - num_epochs = 20 - expected_accuracy = 0.09 - else: - raise AssertionError("Unsupported optimizer type '" + optimizer + "' specified") - # use accuracy as the metric + + # use Sparse SGD with learning rate 0.1 to train + sgd = mx.optimizer.SGD(momentum=0.1, clip_gradient=5.0, learning_rate=0.01, + rescale_grad=1.0/batch_size) + mod.init_optimizer(optimizer=sgd) + if num_epochs is None: + num_epochs = 50 + expected_accuracy = 0.02 + + # use accuracy as the metric metric = mx.metric.create('MSE') # train 'num_epochs' epoch for epoch in range(num_epochs): @@ -655,23 +639,7 @@ def fm(factor_size, feature_dim, init): if num_epochs > 1: assert(metric.get()[1] < expected_accuracy) - if verbose is True: - print("============ SGD ==========================") - start = 
time.clock() - check_factorization_machine_module('sgd') - if verbose is True: - print("Duration: {}".format(time.clock() - start)) - print("============ ADAM ==========================") - start = time.clock() - check_factorization_machine_module('adam') - if verbose is True: - print("Duration: {}".format(time.clock() - start)) - print("============ ADAGRAD ==========================") - start = time.clock() - check_factorization_machine_module('adagrad') - if verbose is True: - print("Duration: {}".format(time.clock() - start)) - + check_factorization_machine_module() @with_seed() def test_module_initializer(): From ae698f9598d88118b0c96cae3965dd2375884562 Mon Sep 17 00:00:00 2001 From: "Joshua Z. Zhang" Date: Fri, 3 Aug 2018 15:42:35 -0700 Subject: [PATCH 61/63] disable opencv threading for forked process (#12025) --- src/initialize.cc | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/initialize.cc b/src/initialize.cc index 1fd92628e..342b0ee01 100644 --- a/src/initialize.cc +++ b/src/initialize.cc @@ -26,6 +26,9 @@ #include #include #include "./engine/openmp.h" +#if MXNET_USE_OPENCV +#include <opencv2/opencv.hpp> +#endif // MXNET_USE_OPENCV namespace mxnet { #if MXNET_USE_SIGNAL_HANDLER && DMLC_LOG_STACK_TRACE @@ -57,6 +60,9 @@ class LibraryInitializer { // Make children single threaded since they are typically workers dmlc::SetEnv("MXNET_CPU_WORKER_NTHREADS", 1); dmlc::SetEnv("OMP_NUM_THREADS", 1); +#if MXNET_USE_OPENCV + cv::setNumThreads(0); // disable opencv threading +#endif // MXNET_USE_OPENCV engine::OpenMP::Get()->set_enabled(false); Engine::Get()->Start(); }); From 22c97efc418463befc12881892246f999a85b971 Mon Sep 17 00:00:00 2001 From: Junru Shao Date: Fri, 3 Aug 2018 15:48:02 -0700 Subject: [PATCH 62/63] Bug fixes in control flow operators (#11942) --- python/mxnet/symbol/contrib.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/python/mxnet/symbol/contrib.py b/python/mxnet/symbol/contrib.py index 884288364..1d42cf7c1 100644 --- a/python/mxnet/symbol/contrib.py +++ b/python/mxnet/symbol/contrib.py @@ -486,12 +486,12 @@ def _union_inputs(*graphs): input_id_to_loc = {} # Dict[int, int], given id(sym), input_id_to_loc maps it # to a `loc`, where inputs[loc] = sym for graph in graphs: - # input_syms: all inputs to the `graph` - name_to_input_syms = {sym.name: sym for sym in _get_graph_inputs(graph)} # some loop_vars are inputs to `graph`, some are not name_to_loop_vars = {sym.name: sym for sym in loop_vars} # other inputs to `graph` created by cut_graph name_to_cut_g_syms = {sym.list_outputs()[0]: sym for sym in _cut_subgraph(graph)} + # input_syms: all inputs to the `graph` + name_to_input_syms = {sym.name: sym for sym in _get_graph_inputs(graph)} # also we collect the mapping from var's name to var's loc in loop_vars name_to_var_locs = {sym.name: i for i, sym in enumerate(loop_vars)} # collect arguments for each subgraph @@ -644,12 +644,12 @@ def _union_inputs(*graphs): input_id_to_loc = {} # Dict[int, int], given id(sym), input_id_to_loc maps it # to a `loc`, where inputs[loc] = sym for graph in graphs: - # input_syms: all inputs to the `graph` - name_to_input_syms = {sym.name: sym for sym in _get_graph_inputs(graph)} # some input_vars are inputs to `graph`, some are not name_to_input_vars = {sym.name: sym for sym in inputs} # other inputs to `graph` created by cut_graph name_to_cut_g_syms = {sym.list_outputs()[0]: sym for sym in _cut_subgraph(graph)} + # input_syms: all inputs to the `graph` + name_to_input_syms = {sym.name: sym for sym in _get_graph_inputs(graph)} # collect arguments for each subgraph input_locs = [] # results from the second step for name in graph.list_inputs(): @@ -696,5 +696,4 @@ def _union_inputs(*graphs): else_input_locs=else_input_locs, num_outputs=then_num_outputs ) - result = _to_symbol_tuple(result, "result") - return list(result) + return [result[i] for i in range(then_num_outputs)]
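The new return statement relies on the fact that a multi-output Symbol supports integer indexing, yielding one single-output Symbol per output. A minimal sketch of that pattern with hypothetical symbols (not taken from contrib.py):

    import mxnet as mx
    a = mx.sym.var('a')
    b = mx.sym.var('b')
    grouped = mx.sym.Group([a + b, a * b])  # a symbol with two outputs
    # unpack each output into its own single-output symbol, as the fixed return does
    outs = [grouped[i] for i in range(len(grouped.list_outputs()))]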
From 8d4d5fa9e6594ac633ae1418c3ce4c47adcea653 Mon Sep 17 00:00:00 2001 From: Pedro Larroy <928489+larroy@users.noreply.github.com> Date: Sat, 4 Aug 2018 00:48:31 +0200 Subject: [PATCH 63/63] Fix data narrowing warning on graph_executor.cc (#11969) --- src/executor/graph_executor.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/executor/graph_executor.cc b/src/executor/graph_executor.cc index 7386de4d1..33c6f574a 100644 --- a/src/executor/graph_executor.cc +++ b/src/executor/graph_executor.cc @@ -1282,7 +1282,7 @@ void GraphExecutor::InitDataEntryMemory(std::vector<NDArray>* shared_pool) { for (size_t i = 0; i < pool_info.size(); i++) { sorted_pool_index.push_back(i); } - auto pool_comparator = [&pool_info](int lhs, int rhs){ + auto pool_comparator = [&pool_info](size_t lhs, size_t rhs){ return pool_info[lhs].bytes > pool_info[rhs].bytes; }; std::sort(sorted_pool_index.begin(), sorted_pool_index.end(), pool_comparator);
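Changing the lambda parameters from int to size_t matches the size_t indices stored in sorted_pool_index, which is what silences the implicit narrowing conversion warning. A minimal Python sketch of the ordering the comparator produces, with hypothetical byte sizes standing in for pool_info[i].bytes:

    pool_bytes = [512, 2048, 1024]  # hypothetical pool entry sizes
    sorted_pool_index = sorted(range(len(pool_bytes)),
                               key=lambda i: pool_bytes[i], reverse=True)
    assert sorted_pool_index == [1, 2, 0]  # largest allocation considered first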