From b5ea1943bdf113546d7161cfda339365de4d80f8 Mon Sep 17 00:00:00 2001
From: Chaitanya Prakash Bapat <chai.bapat@gmail.com>
Date: Fri, 30 Nov 2018 09:47:53 -0800
Subject: [PATCH 01/54] Skip flaky test
 https://github.com/apache/incubator-mxnet/issues/13446 (#13480)

---
 tests/python/unittest/test_random.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/python/unittest/test_random.py b/tests/python/unittest/test_random.py
index 3436e9a9e80e..3026d31c0f96 100644
--- a/tests/python/unittest/test_random.py
+++ b/tests/python/unittest/test_random.py
@@ -860,6 +860,7 @@ def test_randint_extremes():
     assert a>=50000000 and a<=50000010
 
 @with_seed()
+@unittest.skip("Flaky test: https://github.com/apache/incubator-mxnet/issues/13446")
 def test_randint_generator():
     ctx = mx.context.current_context()
     for dtype in ['int32', 'int64']:

From 883d7712af9d3d6f3a112746c6a318f5f2677b7c Mon Sep 17 00:00:00 2001
From: "Joshua Z. Zhang" <cheungchih@gmail.com>
Date: Fri, 30 Nov 2018 10:09:47 -0800
Subject: [PATCH 02/54] Rewrite dataloader with process pool, improves
 responsiveness and reliability (#13447)

* fix recordio.py

* rewrite dataloader with pool

* fix batch as tuple

* fix prefetching

* fix pylint

* picklable function

* use pickle

* add missing commit
---
 python/mxnet/gluon/data/dataloader.py | 223 ++++++++++++++++++++++----
 python/mxnet/recordio.py              |  17 ++
 2 files changed, 209 insertions(+), 31 deletions(-)

diff --git a/python/mxnet/gluon/data/dataloader.py b/python/mxnet/gluon/data/dataloader.py
index 86cb835f5128..ad0f534d16dd 100644
--- a/python/mxnet/gluon/data/dataloader.py
+++ b/python/mxnet/gluon/data/dataloader.py
@@ -36,7 +36,6 @@
 
 from . import sampler as _sampler
 from ... import nd, context
-from ...recordio import MXRecordIO
 
 if sys.platform == 'darwin' or sys.platform == 'win32':
     def rebuild_ndarray(*args):
@@ -159,29 +158,9 @@ def _as_in_context(data, ctx):
         return [_as_in_context(d, ctx) for d in data]
     return data
 
-def _recursive_fork_recordio(obj, depth, max_depth=1000):
-    """Recursively find instance of MXRecordIO and reset file handler.
-    This is required for MXRecordIO which holds a C pointer to a opened file after fork.
-    """
-    if depth >= max_depth:
-        return
-    if isinstance(obj, MXRecordIO):
-        obj.close()
-        obj.open()  # re-obtain file hanlder in new process
-    elif (hasattr(obj, '__dict__')):
-        for _, v in obj.__dict__.items():
-            _recursive_fork_recordio(v, depth + 1, max_depth)
-
-def worker_loop(dataset, key_queue, data_queue, batchify_fn):
-    """Worker loop for multiprocessing DataLoader."""
-    # re-fork a new recordio handler in new process if applicable
-    # for a dataset with transform function, the depth of MXRecordIO is 1
-    # for a lazy transformer, the depth is 2
-    # for a user defined transformer, the depth is unknown, try a reasonable depth
-    limit = sys.getrecursionlimit()
-    max_recursion_depth = min(limit - 5, max(10, limit // 2))
-    _recursive_fork_recordio(dataset, 0, max_recursion_depth)
 
+def worker_loop_v1(dataset, key_queue, data_queue, batchify_fn):
+    """Worker loop for multiprocessing DataLoader."""
     while True:
         idx, samples = key_queue.get()
         if idx is None:
@@ -189,7 +168,7 @@ def worker_loop(dataset, key_queue, data_queue, batchify_fn):
         batch = batchify_fn([dataset[i] for i in samples])
         data_queue.put((idx, batch))
 
-def fetcher_loop(data_queue, data_buffer, pin_memory=False, data_buffer_lock=None):
+def fetcher_loop_v1(data_queue, data_buffer, pin_memory=False, data_buffer_lock=None):
     """Fetcher loop for fetching data from queue and put in reorder dict."""
     while True:
         idx, batch = data_queue.get()
@@ -206,10 +185,10 @@ def fetcher_loop(data_queue, data_buffer, pin_memory=False, data_buffer_lock=Non
             data_buffer[idx] = batch
 
 
-class _MultiWorkerIter(object):
-    """Interal multi-worker iterator for DataLoader."""
+class _MultiWorkerIterV1(object):
+    """Internal multi-worker iterator for DataLoader."""
     def __init__(self, num_workers, dataset, batchify_fn, batch_sampler, pin_memory=False,
-                 worker_fn=worker_loop):
+                 worker_fn=worker_loop_v1):
         assert num_workers > 0, "_MultiWorkerIter is not for {} workers".format(num_workers)
         self._num_workers = num_workers
         self._dataset = dataset
@@ -237,7 +216,7 @@ def __init__(self, num_workers, dataset, batchify_fn, batch_sampler, pin_memory=
         self._workers = workers
 
         self._fetcher = threading.Thread(
-            target=fetcher_loop,
+            target=fetcher_loop_v1,
             args=(self._data_queue, self._data_buffer, pin_memory, self._data_buffer_lock))
         self._fetcher.daemon = True
         self._fetcher.start()
@@ -299,7 +278,7 @@ def shutdown(self):
             self._shutdown = True
 
 
-class DataLoader(object):
+class DataLoaderV1(object):
     """Loads data from a dataset and returns mini-batches of data.
 
     Parameters
@@ -390,8 +369,190 @@ def same_process_iter():
             return same_process_iter()
 
         # multi-worker
-        return _MultiWorkerIter(self._num_workers, self._dataset,
-                                self._batchify_fn, self._batch_sampler, self._pin_memory)
+        return _MultiWorkerIterV1(self._num_workers, self._dataset,
+                                  self._batchify_fn, self._batch_sampler, self._pin_memory)
+
+    def __len__(self):
+        return len(self._batch_sampler)
+
+_worker_dataset = None
+def _worker_initializer(dataset):
+    """Initializer for processing pool."""
+    # global dataset is per-process based and only available in worker processes
+    # this is only necessary to handle MXIndexedRecordIO because otherwise dataset
+    # can be passed as argument
+    global _worker_dataset
+    _worker_dataset = dataset
+
+def _worker_fn(samples, batchify_fn):
+    """Function for processing data in worker process."""
+    # it is required that each worker process has to fork a new MXIndexedRecordIO handle
+    # preserving dataset as global variable can save tons of overhead and is safe in new process
+    global _worker_dataset
+    batch = batchify_fn([_worker_dataset[i] for i in samples])
+    buf = io.BytesIO()
+    ForkingPickler(buf, pickle.HIGHEST_PROTOCOL).dump(batch)
+    return buf.getvalue()
+
+class _MultiWorkerIter(object):
+    """Internal multi-worker iterator for DataLoader."""
+    def __init__(self, worker_pool, batchify_fn, batch_sampler, pin_memory=False,
+                 worker_fn=_worker_fn, prefetch=0):
+        self._worker_pool = worker_pool
+        self._batchify_fn = batchify_fn
+        self._batch_sampler = batch_sampler
+        self._data_buffer = {}
+        self._rcvd_idx = 0
+        self._sent_idx = 0
+        self._iter = iter(self._batch_sampler)
+        self._worker_fn = worker_fn
+        self._pin_memory = pin_memory
+        # pre-fetch
+        for _ in range(prefetch):
+            self._push_next()
+
+    def __len__(self):
+        return len(self._batch_sampler)
+
+    def _push_next(self):
+        """Assign next batch workload to workers."""
+        r = next(self._iter, None)
+        if r is None:
+            return
+        async_ret = self._worker_pool.apply_async(self._worker_fn, (r, self._batchify_fn))
+        self._data_buffer[self._sent_idx] = async_ret
+        self._sent_idx += 1
+
+    def __next__(self):
+        self._push_next()
+        if self._rcvd_idx == self._sent_idx:
+            assert not self._data_buffer, "Data buffer should be empty at this moment"
+            raise StopIteration
+
+        assert self._rcvd_idx < self._sent_idx, "rcvd_idx must be smaller than sent_idx"
+        assert self._rcvd_idx in self._data_buffer, "fatal error with _push_next, rcvd_idx missing"
+        ret = self._data_buffer.pop(self._rcvd_idx)
+        batch = pickle.loads(ret.get())
+        if self._pin_memory:
+            batch = _as_in_context(batch, context.cpu_pinned())
+        batch = batch[0] if len(batch) == 1 else batch
+        self._rcvd_idx += 1
+        return batch
+
+    def next(self):
+        return self.__next__()
+
+    def __iter__(self):
+        return self
+
+
+class DataLoader(object):
+    """Loads data from a dataset and returns mini-batches of data.
+
+    Parameters
+    ----------
+    dataset : Dataset
+        Source dataset. Note that numpy and mxnet arrays can be directly used
+        as a Dataset.
+    batch_size : int
+        Size of mini-batch.
+    shuffle : bool
+        Whether to shuffle the samples.
+    sampler : Sampler
+        The sampler to use. Either specify sampler or shuffle, not both.
+    last_batch : {'keep', 'discard', 'rollover'}
+        How to handle the last batch if batch_size does not evenly divide
+        `len(dataset)`.
+
+        keep - A batch with fewer samples than previous batches is returned.
+        discard - The last batch is discarded if it's incomplete.
+        rollover - The remaining samples are rolled over to the next epoch.
+    batch_sampler : Sampler
+        A sampler that returns mini-batches. Do not specify batch_size,
+        shuffle, sampler, and last_batch if batch_sampler is specified.
+    batchify_fn : callable
+        Callback function to allow users to specify how to merge samples
+        into a batch. Defaults to `default_batchify_fn`::
+
+            def default_batchify_fn(data):
+                if isinstance(data[0], nd.NDArray):
+                    return nd.stack(*data)
+                elif isinstance(data[0], tuple):
+                    data = zip(*data)
+                    return [default_batchify_fn(i) for i in data]
+                else:
+                    data = np.asarray(data)
+                    return nd.array(data, dtype=data.dtype)
+
+    num_workers : int, default 0
+        The number of multiprocessing workers to use for data preprocessing.
+    pin_memory : boolean, default False
+        If ``True``, the dataloader will copy NDArrays into pinned memory
+        before returning them. Copying from CPU pinned memory to GPU is faster
+        than from normal CPU memory.
+    prefetch : int, default is `num_workers * 2`
+        The number of batches to prefetch. Only takes effect if `num_workers` > 0.
+        If `prefetch` > 0, it allows worker processes to prefetch that many batches before
+        data is requested from the iterator.
+        Note that a larger prefetch count gives smoother bootstrapping performance,
+        but consumes more shared memory. A smaller count may defeat the purpose of using
+        multiple worker processes; try reducing `num_workers` in this case.
+        Defaults to `num_workers * 2`.
+    """
+    def __init__(self, dataset, batch_size=None, shuffle=False, sampler=None,
+                 last_batch=None, batch_sampler=None, batchify_fn=None,
+                 num_workers=0, pin_memory=False, prefetch=None):
+        self._dataset = dataset
+        self._pin_memory = pin_memory
+
+        if batch_sampler is None:
+            if batch_size is None:
+                raise ValueError("batch_size must be specified unless " \
+                                 "batch_sampler is specified")
+            if sampler is None:
+                if shuffle:
+                    sampler = _sampler.RandomSampler(len(dataset))
+                else:
+                    sampler = _sampler.SequentialSampler(len(dataset))
+            elif shuffle:
+                raise ValueError("shuffle must not be specified if sampler is specified")
+
+            batch_sampler = _sampler.BatchSampler(
+                sampler, batch_size, last_batch if last_batch else 'keep')
+        elif batch_size is not None or shuffle or sampler is not None or \
+                last_batch is not None:
+            raise ValueError("batch_size, shuffle, sampler and last_batch must " \
+                             "not be specified if batch_sampler is specified.")
+
+        self._batch_sampler = batch_sampler
+        self._num_workers = num_workers if num_workers >= 0 else 0
+        self._worker_pool = None
+        self._prefetch = max(0, int(prefetch) if prefetch is not None else 2 * self._num_workers)
+        if self._num_workers > 0:
+            self._worker_pool = multiprocessing.Pool(
+                self._num_workers, initializer=_worker_initializer, initargs=[self._dataset])
+        if batchify_fn is None:
+            if num_workers > 0:
+                self._batchify_fn = default_mp_batchify_fn
+            else:
+                self._batchify_fn = default_batchify_fn
+        else:
+            self._batchify_fn = batchify_fn
+
+    def __iter__(self):
+        if self._num_workers == 0:
+            def same_process_iter():
+                for batch in self._batch_sampler:
+                    ret = self._batchify_fn([self._dataset[idx] for idx in batch])
+                    if self._pin_memory:
+                        ret = _as_in_context(ret, context.cpu_pinned())
+                    yield ret
+            return same_process_iter()
+
+        # multi-worker
+        return _MultiWorkerIter(self._worker_pool, self._batchify_fn, self._batch_sampler,
+                                pin_memory=self._pin_memory, worker_fn=_worker_fn,
+                                prefetch=self._prefetch)
 
     def __len__(self):
         return len(self._batch_sampler)
diff --git a/python/mxnet/recordio.py b/python/mxnet/recordio.py
index 2def141c9340..bdc63235d702 100644
--- a/python/mxnet/recordio.py
+++ b/python/mxnet/recordio.py
@@ -18,6 +18,7 @@
 """Read and write for the RecordIO data format."""
 from __future__ import absolute_import
 from collections import namedtuple
+from multiprocessing import current_process
 
 import ctypes
 import struct
@@ -65,6 +66,7 @@ def __init__(self, uri, flag):
         self.uri = c_str(uri)
         self.handle = RecordIOHandle()
         self.flag = flag
+        self.pid = None
         self.is_open = False
         self.open()
 
@@ -78,6 +80,7 @@ def open(self):
             self.writable = False
         else:
             raise ValueError("Invalid flag %s"%self.flag)
+        self.pid = current_process().pid
         self.is_open = True
 
     def __del__(self):
@@ -109,6 +112,14 @@ def __setstate__(self, d):
         if is_open:
             self.open()
 
+    def _check_pid(self, allow_reset=False):
+        """Check process id to ensure integrity, reset if in new process."""
+        if not self.pid == current_process().pid:
+            if allow_reset:
+                self.reset()
+            else:
+                raise RuntimeError("Forbidden operation in multiple processes")
+
     def close(self):
         """Closes the record file."""
         if not self.is_open:
@@ -118,6 +129,7 @@ def close(self):
         else:
             check_call(_LIB.MXRecordIOReaderFree(self.handle))
         self.is_open = False
+        self.pid = None
 
     def reset(self):
         """Resets the pointer to first item.
@@ -156,6 +168,7 @@ def write(self, buf):
             Buffer to write.
         """
         assert self.writable
+        self._check_pid(allow_reset=False)
         check_call(_LIB.MXRecordIOWriterWriteRecord(self.handle,
                                                     ctypes.c_char_p(buf),
                                                     ctypes.c_size_t(len(buf))))
@@ -182,6 +195,9 @@ def read(self):
             Buffer read.
         """
         assert not self.writable
+        # trying to implicitly read from multiple processes is forbidden,
+        # there's no elegant way to handle unless lock is introduced
+        self._check_pid(allow_reset=False)
         buf = ctypes.c_char_p()
         size = ctypes.c_size_t()
         check_call(_LIB.MXRecordIOReaderReadRecord(self.handle,
@@ -255,6 +271,7 @@ def seek(self, idx):
         This function is internally called by `read_idx(idx)` to find the current
         reader pointer position. It doesn't return anything."""
         assert not self.writable
+        self._check_pid(allow_reset=True)
         pos = ctypes.c_size_t(self.idx[idx])
         check_call(_LIB.MXRecordIOReaderSeek(self.handle, pos))
 

From 068c2c612c6581f5d04a580ac1f5c3a1293bb491 Mon Sep 17 00:00:00 2001
From: Aaron Markham <markhama@amazon.com>
Date: Fri, 30 Nov 2018 10:41:07 -0800
Subject: [PATCH 03/54] Fix errors in docstrings for subgraph op; use code
 directive (#13463)

---
 src/operator/contrib/dgl_graph.cc | 52 ++++++++++++++++++-------------
 1 file changed, 31 insertions(+), 21 deletions(-)

diff --git a/src/operator/contrib/dgl_graph.cc b/src/operator/contrib/dgl_graph.cc
index 1bb47b89bdea..74ad3d435648 100644
--- a/src/operator/contrib/dgl_graph.cc
+++ b/src/operator/contrib/dgl_graph.cc
@@ -1118,19 +1118,24 @@ sets of vertices as input. For each set of vertices, it returns a pair
 of CSR matrices if return_mapping is True: the first matrix contains edges
 with new edge Ids, the second matrix contains edges with the original
 edge Ids.
-Example::
-  x=[[1, 0, 0, 2],
-     [3, 0, 4, 0],
-     [0, 5, 0, 0],
-     [0, 6, 7, 0]]
-  v = [0, 1, 2]
-  dgl_subgraph(x, v, return_mapping=True) =
-    [[1, 0, 0],
-     [2, 0, 3],
-     [0, 4, 0]],
-    [[1, 0, 0],
-     [3, 0, 4],
-     [0, 5, 0]]
+
+Example:
+
+   .. code:: python
+
+     x=[[1, 0, 0, 2],
+        [3, 0, 4, 0],
+        [0, 5, 0, 0],
+        [0, 6, 7, 0]]
+     v = [0, 1, 2]
+     dgl_subgraph(x, v, return_mapping=True) =
+       [[1, 0, 0],
+        [2, 0, 3],
+        [0, 4, 0]],
+       [[1, 0, 0],
+        [3, 0, 4],
+        [0, 5, 0]]
+
 )code" ADD_FILELINE)
 .set_attr_parser(ParamParser<DGLSubgraphParam>)
 .set_num_inputs([](const NodeAttrs& attrs) {
@@ -1296,13 +1301,17 @@ NNVM_REGISTER_OP(_contrib_edge_id)
 stored in a CSR matrix (the value of the CSR stores the edge Id of the graph).
 output[i] = input[u[i], v[i]] if there is an edge between u[i] and v[i]],
 otherwise output[i] will be -1. Both u and v should be 1D vectors.
-Example::
-  x = [[ 1, 0, 0 ],
-       [ 0, 2, 0 ],
-       [ 0, 0, 3 ]]
-  u = [ 0, 0, 1, 1, 2, 2 ]
-  v = [ 0, 1, 1, 2, 0, 2 ]
-  edge_id(x, u, v) = [ 1, -1, 2, -1, -1, 3 ]
+
+Example:
+
+   .. code:: python
+
+      x = [[ 1, 0, 0 ],
+           [ 0, 2, 0 ],
+           [ 0, 0, 3 ]]
+      u = [ 0, 0, 1, 1, 2, 2 ]
+      v = [ 0, 1, 1, 2, 0, 2 ]
+      edge_id(x, u, v) = [ 1, -1, 2, -1, -1, 3 ]
 
 The storage type of ``edge_id`` output depends on storage types of inputs
   - edge_id(csr, default, default) = default
@@ -1367,7 +1376,8 @@ NNVM_REGISTER_OP(_contrib_dgl_adjacency)
 .describe(R"code(This operator converts a CSR matrix whose values are edge Ids
 to an adjacency matrix whose values are ones. The output CSR matrix always has
 the data value of float32.
-Example::
+
+Example::
 
   x = [[ 1, 0, 0 ],
        [ 0, 2, 0 ],

From 55acf569da0eddef61ff7d7b0a042b7e3781847e Mon Sep 17 00:00:00 2001
From: Naveen Swamy <mn.naveen@gmail.com>
Date: Fri, 30 Nov 2018 10:54:12 -0800
Subject: [PATCH 04/54] [MXNET-1158] JVM Memory Management Documentation
 (#13105)

* update train_mnist

* Add documentation for JVM Memory Management

* update doc

* address nit picks

* address nit picks

* Grammar and clarity edits for memory management doc

* Edits for scala memory management

* Update memory-management.md

* Update memory-management.md

* Update memory-management.md

* capitalization fix
---
 .../examples/scripts/run_train_mnist.sh       |  24 +++-
 scala-package/memory-management.md            | 118 ++++++++++++++++++
 2 files changed, 138 insertions(+), 4 deletions(-)
 create mode 100644 scala-package/memory-management.md

diff --git a/scala-package/examples/scripts/run_train_mnist.sh b/scala-package/examples/scripts/run_train_mnist.sh
index ea53c1ade66f..d27b7cbb3657 100755
--- a/scala-package/examples/scripts/run_train_mnist.sh
+++ b/scala-package/examples/scripts/run_train_mnist.sh
@@ -19,15 +19,31 @@
 
 set -e
 
+hw_type=cpu  # default to the CPU build unless "gpu" is passed as the first argument
+if [[ $1 = gpu ]]
+then
+    hw_type=gpu
+fi
+
+platform=linux-x86_64
+
+if [[ $OSTYPE = darwin* ]]  # prefix match on "darwin"; "[darwin]*" was a character class
+then
+    platform=osx-x86_64
+    hw_type=cpu  # the script always selects the CPU assembly on macOS
+fi
+
 MXNET_ROOT=$(cd "$(dirname $0)/../../.."; pwd)
 echo $MXNET_ROOT
-CLASS_PATH=$MXNET_ROOT/scala-package/assembly/linux-x86_64-cpu/target/*:$MXNET_ROOT/scala-package/examples/target/*:$MXNET_ROOT/scala-package/examples/target/classes/lib/*:$MXNET_ROOT/scala-package/infer/target/*
+CLASS_PATH=$MXNET_ROOT/scala-package/assembly/$platform-$hw_type/target/*:$MXNET_ROOT/scala-package/examples/target/*:$MXNET_ROOT/scala-package/examples/target/classes/lib/*
 
 # model dir
 DATA_PATH=$2
 
-java -XX:+PrintGC -Xms256M -Xmx512M -Dmxnet.traceLeakedObjects=false -cp $CLASS_PATH \
-        org.apache.mxnetexamples.imclassification.TrainMnist \
-        --data-dir /home/ubuntu/mxnet_scala/scala-package/examples/mnist/ \
+java -XX:+PrintGC -Dmxnet.traceLeakedObjects=false -cp $CLASS_PATH \
+        org.apache.mxnetexamples.imclassification.TrainModel \
+        --data-dir $MXNET_ROOT/scala-package/examples/mnist/ \
+        --network mlp \
+        --num-layers 50 \
         --num-epochs 10000000 \
         --batch-size 1024
\ No newline at end of file
diff --git a/scala-package/memory-management.md b/scala-package/memory-management.md
new file mode 100644
index 000000000000..33c36b6e6ab0
--- /dev/null
+++ b/scala-package/memory-management.md
@@ -0,0 +1,118 @@
+# JVM Memory Management
+The Scala and Java bindings of Apache MXNet use native memory (memory from the C++ heap in either RAM or GPU memory) for most of the MXNet objects such as NDArray, Symbol, Executor, KVStore, Data Iterators, etc.
+The associated Scala classes act only as wrappers. The operations done on these wrapper objects are then directed to the high performance MXNet C++ backend via the Java Native Interface (JNI). Therefore, the bytes are stored in the C++ native heap which allows for fast access.
+
+However, the JVM Garbage Collector only manages objects allocated in the JVM Heap and is not aware of the memory footprint of these objects in the native memory. Hence, the allocation/deallocation of native memory must be managed by MXNet Scala.
+Allocating native memory is straightforward and is done during the construction of the object by calling the associated C++ API through JNI. However, since JVM languages do not have destructors, the deallocation of these objects must be done explicitly.
+MXNet Scala provides a few easy modes of operation which are explained in detail below.
+
+## Memory Management in Scala 
+### 1.  [ResourceScope.using](https://github.com/apache/incubator-mxnet/blob/master/scala-package/core/src/main/scala/org/apache/mxnet/ResourceScope.scala#L106) (Recommended)
+`ResourceScope.using` provides the familiar Java try-with-resources primitive in Scala and will automatically manage the memory of all the MXNet objects created in the associated code block (`body`). It works by tracking the allocations performed inside the code block and deallocating them when exiting the block.
+Passing MXNet objects out of a using block can be easily accomplished by simply returning an object or an iterable containing multiple MXNet objects. If you have nested using blocks, then the returned objects will be moved into the parent scope as well.
+
+**Usage** 
+```scala
+ResourceScope.using() {
+    ResourceScope.using() {
+        val r1 = NDArray.ones(Shape(2, 2))
+        val r2 = NDArray.ones(Shape(3, 4))
+        val r3 = NDArray.ones(Shape(5, 6))
+        val r4 = NDArray.ones(Shape(7, 8))
+        (r3, r4)
+    }
+    r4
+}
+```
+In the example above, we have two ResourceScopes stacked together. In the inner scope, 4 NDArrays `(r1, r2, r3, r4)` are created and the NDArrays 
+`(r3, r4)` are returned. The inner ResourceScope recognizes that it should not deallocate these objects and automatically moves `r3` and  `r4` to the outer scope. When the outer scope 
+returns `r4` from its code-block, it will only deallocate `r3` and will remove `r4` from its list of objects to be deallocated. All other objects are automatically released by calling the C++ backend to free the native memory.
+
+**Note:**
+You should consider nesting ResourceScopes when you have layers of functionality in your application code or create a lot of MXNet objects such as NDArrays.  
+For example, holding onto all the memory that is created for an entire training loop can result in running out of memory, especially when training on GPUs which might only have 8 to 16 GB.  
+It is recommended not to use a single ResourceScope block which spans the entire training code. You should instead nest multiple scopes: an innermost scope where you run forward-backward passes on each batch, a middle scope for each epoch, and an outer scope that runs the entire training script. This is demonstrated in the example below:
+
+```scala
+ResourceScope.using() {
+ val m = Module()
+ m.bind()
+ val k = KVStore(...)
+ ResourceScope.using() {
+     val itr = MXIterator(..)
+     val num_epochs: Int = 100
+     //... 
+     for (i <- 0 until num_epochs) {
+     ResourceScope.using() {
+        val dataBatch = itr.next()
+        while(itr.next()) {
+           m.forward(dataBatch)
+           m.backward(dataBatch)
+           m.update()
+        }
+     }
+   }
+ }
+}
+
+```  
+       
+### 2.  Using Phantom References (Recommended for some use cases)
+
+Apache MXNet uses [Phantom References](https://docs.oracle.com/javase/8/docs/api/java/lang/ref/PhantomReference.html) to track all MXNet Objects that have native memory associated with it. 
+When the Garbage Collector runs, it identifies unreachable Scala/Java objects in the JVM Heap and finalizes them. 
+It then enqueues objects which are ready to be reclaimed into a reference queue. We take advantage of this and do a 
+pre-mortem cleanup on these wrapper objects by freeing the corresponding native memory as well.
+ 
+This approach is automatic and does not require any special code to clean up the native memory. However, the Garbage Collector is not aware of the potentially large amount of native memory used and therefore may not free up memory often enough with its standard behavior.
+You can control the frequency of garbage collection by calling System.gc() at strategic points such as the end of an epoch or the end of a mini-batch.
+
+This approach could be suitable for some use cases such as inference on CPUs where you have a large amount of Memory (RAM) on your system.
+
+**Note:**
+Calling GC too frequently can also cause your application to perform poorly. This approach might not be suitable 
+for use cases which quickly allocate a large number of large NDArrays such as when training a GAN model.
+
+### 3. Using dispose Pattern (least Recommended)
+ 
+There might be situations where you want to manually manage the lifecycle of Apache MXNet objects. For such use-cases, we have provided the `dispose()` method which will manually deallocate the associated native memory when called. We have also
+made all MXNet objects [AutoCloseable](https://docs.oracle.com/javase/8/docs/api/java/lang/AutoCloseable.html). If you are using Java8 and above you can use it with try-with-resources or call close() in the finally block.
+
+**Note:**
+We recommend you avoid manually managing MXNet objects and instead use `ResourceScope.using`. Manual management creates less readable code and could leak memory if you miss calling dispose (until it is cleaned up by the Garbage Collector through the Phantom References).
+
+```scala
+def showDispose(): Unit = {
+    val r = NDArray.ones(Shape (2, 2))
+    r.dispose()
+}
+```
+
+## Memory Management in Java
+Memory Management in MXNet Java is similar to Scala. We recommend you use [ResourceScope](https://github.com/apache/incubator-mxnet/blob/master/scala-package/core/src/main/scala/org/apache/mxnet/ResourceScope.scala#L32) in a `try-with-resources` block or in a `try-finally` block.
+The [try-with-resource](https://docs.oracle.com/javase/tutorial/essential/exceptions/tryResourceClose.html) tracks the resources declared in the try block and automatically closes them upon exiting (supported from Java 7 onwards). 
+The ResourceScope discussed above implements AutoCloseable and tracks all MXNet Objects created at a Thread Local scope level. 
+
+```java
+try(ResourceScope scope = new ResourceScope()) {
+    NDArray test = NDArray.ones((Shape (2,2))
+}
+```
+or 
+```java
+try {
+    ResourceScope scope = new ResourceScope()
+    NDArray test = NDArray.ones((Shape(2,2))
+} finally {
+    scope.close()
+}
+``` 
+
+**Note:**
+A ResourceScope within a try block tracks all MXNet Native Object Allocations (NDArray, Symbol, Executor, etc.) and deallocates them at
+the end of the try block. This is also true of the objects that are returned; e.g., in the example above, the native memory associated with `test` would be deallocated even if it were to be returned.
+If you use the object outside of the try block, the process might crash due to illegal memory access.
+
+To retain certain objects created within try blocks, you should explicitly remove them from the scope by calling `scope.moveToOuterScope`.
+It is highly recommended to nest multiple try-with-resource ResourceScopes so you do not have to explicitly manage the lifecycle of the Native objects.
+

From dabd6886a845e4c2a3163dc05a1406e85e8cc78e Mon Sep 17 00:00:00 2001
From: Haibin Lin <linhaibin.eric@gmail.com>
Date: Fri, 30 Nov 2018 11:58:16 -0800
Subject: [PATCH 05/54] Update row_sparse tutorial (#13414)

Update row_sparse tutorial
---
 docs/tutorials/sparse/row_sparse.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/tutorials/sparse/row_sparse.md b/docs/tutorials/sparse/row_sparse.md
index 27cc0d3d903e..46a5edad075e 100644
--- a/docs/tutorials/sparse/row_sparse.md
+++ b/docs/tutorials/sparse/row_sparse.md
@@ -459,7 +459,7 @@ Note that warning messages will be printed when such a storage fallback event ha
 
 ## Sparse Optimizers
 
-In MXNet, sparse gradient updates are applied when weight, state and gradient are all in `row_sparse` storage.
+In MXNet, sparse gradient updates are applied when gradient is in `row_sparse` storage and the optimizer is created with `lazy_update=True`.
 The sparse optimizers only update the row slices of the weight and the states whose indices appear
 in `gradient.indices`. For example, the default update rule for SGD optimizer is:
 

From b58de7494d9bf329af9730da91b5f6c21348cbff Mon Sep 17 00:00:00 2001
From: Sina Afrooze <sina.beh@gmail.com>
Date: Fri, 30 Nov 2018 13:33:16 -0800
Subject: [PATCH 06/54] Add resiliency to onnx export code (#13426)

* Added resiliency to onnx export code

- With previous infer-shape implementation, if input shape was list instead of tuple or if extra non-existent parameters were provided, the code would still work. The fixes in this commit make sure that behavior is restored to prevent any compatibility issues with existing export code.

* Fixed name of net in unittest

* Fix pylint
---
 .../mxnet/contrib/onnx/mx2onnx/export_onnx.py |  5 +++--
 .../onnx/export/mxnet_export_test.py          | 21 +++++++++++++++++--
 2 files changed, 22 insertions(+), 4 deletions(-)

diff --git a/python/mxnet/contrib/onnx/mx2onnx/export_onnx.py b/python/mxnet/contrib/onnx/mx2onnx/export_onnx.py
index 14c674f56f2d..84db5decd503 100644
--- a/python/mxnet/contrib/onnx/mx2onnx/export_onnx.py
+++ b/python/mxnet/contrib/onnx/mx2onnx/export_onnx.py
@@ -134,9 +134,10 @@ def get_outputs(sym, params, in_shape, in_label):
         # remove any input listed in params from sym.list_inputs() and bind them to the input shapes provided
         # by user. Also remove in_label, which is the name of the label symbol that may have been used
         # as the label for loss during training.
-        inputs = {n: s for n, s in zip([n for n in sym.list_inputs() if n not in params and n != in_label], in_shape)}
+        inputs = {n: tuple(s) for n, s in zip([n for n in sym.list_inputs() if n not in params and n != in_label],
+                                              in_shape)}
         # Add params and their shape to list of inputs
-        inputs.update({n: v.shape for n, v in params.items()})
+        inputs.update({n: v.shape for n, v in params.items() if n in sym.list_inputs()})
         # Provide input data as well as input params to infer_shape()
         _, out_shapes, _ = sym.infer_shape(**inputs)
 
diff --git a/tests/python-pytest/onnx/export/mxnet_export_test.py b/tests/python-pytest/onnx/export/mxnet_export_test.py
index f4144fd6c7fa..964d0e760cae 100644
--- a/tests/python-pytest/onnx/export/mxnet_export_test.py
+++ b/tests/python-pytest/onnx/export/mxnet_export_test.py
@@ -286,18 +286,19 @@ def _optional_group(symbols, group=False):
         return symbols
 
 
-def _check_onnx_export(net, group_outputs=False):
+def _check_onnx_export(net, group_outputs=False, shape_type=tuple, extra_params={}):
     net.initialize()
     data = nd.random.uniform(0, 1, (1, 1024))
     output = _force_list(net(data))  # initialize weights
     net_sym = _optional_group(net(sym.Variable('data')), group_outputs)
     net_params = {name:param._reduce() for name, param in net.collect_params().items()}
+    net_params.update(extra_params)
     with tempfile.TemporaryDirectory() as tmpdirname:
         onnx_file_path = os.path.join(tmpdirname, 'net.onnx')
         export_path = onnx_mxnet.export_model(
             sym=net_sym,
             params=net_params,
-            input_shape=[data.shape],
+            input_shape=[shape_type(data.shape)],
             onnx_file_path=onnx_file_path)
         assert export_path == onnx_file_path
         # Try importing the model to symbol
@@ -340,6 +341,22 @@ def hybrid_forward(self, F, x):
     _check_onnx_export(net, group_outputs=True)
 
 
+@with_seed()
+def test_onnx_export_list_shape():
+    net = nn.HybridSequential(prefix='list_shape_net')
+    with net.name_scope():
+        net.add(nn.Dense(100, activation='relu'), nn.Dense(10))
+    _check_onnx_export(net, shape_type=list)
+
+
+@with_seed()
+def test_onnx_export_extra_params():
+    net = nn.HybridSequential(prefix='extra_params_net')
+    with net.name_scope():
+        net.add(nn.Dense(100, activation='relu'), nn.Dense(10))
+    _check_onnx_export(net, extra_params={'extra_param': nd.array([1, 2])})
+
+
 if __name__ == '__main__':
     test_models("bvlc_googlenet", (1, 3, 224, 224), (1, 1000))
     test_models("bvlc_reference_caffenet", (1, 3, 224, 224), (1, 1000))

From 0d480fbe1534d1c1228bf2c7470018ae06cbac37 Mon Sep 17 00:00:00 2001
From: Lin Yuan <apeforest@gmail.com>
Date: Fri, 30 Nov 2018 21:48:20 -0800
Subject: [PATCH 07/54] [MXNET-1185] Support large array in several operators
 (part 1) (#13418)

* fix a few operators with large arrays (# of elements)

* fix bug in broadcast_div and add tests

* address reviewer comment

* add unit test

* add empty line

* retrigger CI
---
 src/operator/elemwise_op_common.h             |   8 +-
 src/operator/mxnet_op.h                       |  68 +++---
 src/operator/random/sampler.h                 |  43 ++--
 src/operator/tensor/broadcast_reduce-inl.h    |  94 ++++----
 .../tensor/elemwise_binary_broadcast_op.h     |  14 +-
 src/operator/tensor/indexing_op.cc            |  26 +--
 src/operator/tensor/indexing_op.cu            |  10 +-
 src/operator/tensor/indexing_op.h             |  39 ++--
 src/operator/tensor/init_op.h                 |   6 +-
 src/operator/tensor/matrix_op-inl.h           | 219 +++++++++---------
 tests/nightly/test_large_array.py             | 128 +++++++++-
 11 files changed, 384 insertions(+), 271 deletions(-)

diff --git a/src/operator/elemwise_op_common.h b/src/operator/elemwise_op_common.h
index cf44da699156..4b8663bba6ea 100644
--- a/src/operator/elemwise_op_common.h
+++ b/src/operator/elemwise_op_common.h
@@ -100,7 +100,7 @@ inline bool ElemwiseStorageAttr(const nnvm::NodeAttrs& attrs,
  *  \tparam rsp whether row sparse stype is supported
  *  \tparam rsp whether csr stype is supported
  */
-template<int n_in, int n_out, bool cpu_only, bool rsp, bool csr>
+template<index_t n_in, index_t n_out, bool cpu_only, bool rsp, bool csr>
 inline bool ElemwiseStorageType(const nnvm::NodeAttrs& attrs,
                                 const int dev_mask,
                                 DispatchMode* dispatch_mode,
@@ -115,7 +115,7 @@ inline bool ElemwiseStorageType(const nnvm::NodeAttrs& attrs,
 template<typename AttrType, bool (*is_none)(const AttrType&),
          bool (*assign)(AttrType*, const AttrType&), bool reverse_infer,
          std::string (*attr_string)(const AttrType&),
-         int n_in = -1, int n_out = -1>
+         index_t n_in = -1, index_t n_out = -1>
 inline bool ElemwiseAttr(const nnvm::NodeAttrs& attrs,
                          std::vector<AttrType> *in_attrs,
                          std::vector<AttrType> *out_attrs,
@@ -154,7 +154,7 @@ inline bool ElemwiseAttr(const nnvm::NodeAttrs& attrs,
   return true;
 }
 
-template<int n_in, int n_out>
+template<index_t n_in, index_t n_out>
 inline bool ElemwiseShape(const nnvm::NodeAttrs& attrs,
                           std::vector<TShape> *in_attrs,
                           std::vector<TShape> *out_attrs) {
@@ -168,7 +168,7 @@ inline bool ElemwiseShape(const nnvm::NodeAttrs& attrs,
     attrs, in_attrs, out_attrs, TShape());
 }
 
-template<int n_in, int n_out>
+template<index_t n_in, index_t n_out>
 inline bool ElemwiseType(const nnvm::NodeAttrs& attrs,
                          std::vector<int> *in_attrs,
                          std::vector<int> *out_attrs) {
diff --git a/src/operator/mxnet_op.h b/src/operator/mxnet_op.h
index 5b106afd8d5b..6cab1990858b 100644
--- a/src/operator/mxnet_op.h
+++ b/src/operator/mxnet_op.h
@@ -289,8 +289,8 @@ inline int get_num_threads<cpu>(const int N) {
 
 /* \brief Compute flattened index given coordinates and shape. */
 template<int ndim>
-MSHADOW_XINLINE int ravel(const Shape<ndim>& coord, const Shape<ndim>& shape) {
-  int ret = 0;
+MSHADOW_XINLINE index_t ravel(const Shape<ndim>& coord, const Shape<ndim>& shape) {
+  index_t ret = 0;
   #pragma unroll
   for (int i = 0; i < ndim; ++i) {
     ret = ret * shape[i] + (shape[i] > coord[i]) * coord[i];
@@ -301,11 +301,11 @@ MSHADOW_XINLINE int ravel(const Shape<ndim>& coord, const Shape<ndim>& shape) {
 
 /* Compute coordinates from flattened index given shape */
 template<int ndim>
-MSHADOW_XINLINE Shape<ndim> unravel(const int idx, const Shape<ndim>& shape) {
+MSHADOW_XINLINE Shape<ndim> unravel(const index_t idx, const Shape<ndim>& shape) {
   Shape<ndim> ret;
   #pragma unroll
-  for (int i = ndim-1, j = idx; i >=0; --i) {
-    int tmp = j / shape[i];
+  for (index_t i = ndim-1, j = idx; i >=0; --i) {
+    auto tmp = j / shape[i];
     ret[i] = j - tmp*shape[i];
     j = tmp;
   }
@@ -315,8 +315,8 @@ MSHADOW_XINLINE Shape<ndim> unravel(const int idx, const Shape<ndim>& shape) {
 
 /* Compute dot product of two vector */
 template<int ndim>
-MSHADOW_XINLINE int dot(const Shape<ndim>& coord, const Shape<ndim>& stride) {
-  int ret = 0;
+MSHADOW_XINLINE index_t dot(const Shape<ndim>& coord, const Shape<ndim>& stride) {
+  index_t ret = 0;
   #pragma unroll
   for (int i = 0; i < ndim; ++i) {
     ret += coord[i] * stride[i];
@@ -327,12 +327,12 @@ MSHADOW_XINLINE int dot(const Shape<ndim>& coord, const Shape<ndim>& stride) {
 
 /* Combining unravel and dot */
 template<int ndim>
-MSHADOW_XINLINE int unravel_dot(const int idx, const Shape<ndim>& shape,
+MSHADOW_XINLINE index_t unravel_dot(const index_t idx, const Shape<ndim>& shape,
   const Shape<ndim>& stride) {
-  int ret = 0;
+  index_t ret = 0;
   #pragma unroll
-  for (int i = ndim-1, j = idx; i >=0; --i) {
-    int tmp = j / shape[i];
+  for (index_t i = ndim-1, j = idx; i >=0; --i) {
+    auto tmp = j / shape[i];
     ret += (j - tmp*shape[i])*stride[i];
     j = tmp;
   }
@@ -433,51 +433,51 @@ struct op_with_req {
 
   /*! \brief input is one tensor */
   template<typename DType>
-  MSHADOW_XINLINE static void Map(int i, DType *out, const DType *in) {
+  MSHADOW_XINLINE static void Map(index_t i, DType *out, const DType *in) {
     KERNEL_ASSIGN(out[i], req, OP::Map(in[i]));
   }
 
   /*! \brief inputs are two tensors */
   template<typename DType>
-  MSHADOW_XINLINE static void Map(int i, DType *out, const DType *lhs, const DType *rhs) {
+  MSHADOW_XINLINE static void Map(index_t i, DType *out, const DType *lhs, const DType *rhs) {
     KERNEL_ASSIGN(out[i], req, OP::Map(lhs[i], rhs[i]));
   }
 
   /*! \brief input is tensor and a scalar value */
   template<typename DType>
-  MSHADOW_XINLINE static void Map(int i, DType *out, const DType *in, const DType value) {
+  MSHADOW_XINLINE static void Map(index_t i, DType *out, const DType *in, const DType value) {
     KERNEL_ASSIGN(out[i], req, OP::Map(in[i], value));
   }
 
   /*! \brief input is tensor and two scalar value */
   template<typename DType>
-  MSHADOW_XINLINE static void Map(int i, DType *out, const DType *in,
+  MSHADOW_XINLINE static void Map(index_t i, DType *out, const DType *in,
                                   const DType value_1, const DType value_2) {
     KERNEL_ASSIGN(out[i], req, OP::Map(in[i], value_1, value_2));
   }
 
   /*! \brief No inputs (ie fill to constant value) */
   template<typename DType>
-  MSHADOW_XINLINE static void Map(int i, DType *out) {
+  MSHADOW_XINLINE static void Map(index_t i, DType *out) {
     KERNEL_ASSIGN(out[i], req, OP::Map());
   }
 
   /*! \brief input is single scalar value */
   template<typename DType>
-  MSHADOW_XINLINE static void Map(int i, DType *out, const DType value) {
+  MSHADOW_XINLINE static void Map(index_t i, DType *out, const DType value) {
     KERNEL_ASSIGN(out[i], req, OP::Map(value));
   }
 
   /*! \brief inputs are two tensors and a scalar value */
   template<typename DType>
-  MSHADOW_XINLINE static void Map(int i, DType *out,
+  MSHADOW_XINLINE static void Map(index_t i, DType *out,
                                   const DType *input_1, const DType *input_2, const DType value) {
     KERNEL_ASSIGN(out[i], req, OP::Map(input_1[i], input_2[i], value));
   }
 
   /*! \brief inputs are three tensors (ie backward grad with binary grad function) */
   template<typename DType>
-  MSHADOW_XINLINE static void Map(int i, DType *out,
+  MSHADOW_XINLINE static void Map(index_t i, DType *out,
                                   const DType *input_1,
                                   const DType *input_2,
                                   const DType *input_3) {
@@ -503,21 +503,21 @@ struct Kernel<OP, cpu> {
    * \param args Varargs to eventually pass to the OP::Map() function
    */
   template<typename ...Args>
-  inline static bool Launch(mshadow::Stream<cpu> *, const int N, Args... args) {
+  inline static bool Launch(mshadow::Stream<cpu> *, const size_t N, Args... args) {
 #ifdef _OPENMP
     const int omp_threads = engine::OpenMP::Get()->GetRecommendedOMPThreadCount();
     if (omp_threads < 2) {
-      for (int i = 0; i < N; ++i) {
+      for (size_t i = 0; i < N; ++i) {
         OP::Map(i, args...);
       }
     } else {
       #pragma omp parallel for num_threads(omp_threads)
-      for (int i = 0; i < N; ++i) {
+      for (index_t i = 0; i < static_cast<index_t>(N); ++i) {
         OP::Map(i, args...);
       }
     }
 #else
-    for (int i = 0; i < N; ++i) {
+    for (size_t i = 0; i < N; ++i) {
       OP::Map(i, args...);
     }
 #endif
@@ -567,22 +567,22 @@ struct Kernel<OP, cpu> {
    * \param args Varargs to eventually pass to the OP::Map() function
    */
   template<typename PRIMITIVE_OP, typename DType, typename ...Args>
-  static void LaunchTuned(mshadow::Stream<cpu> *, const int N, Args... args) {
+  static void LaunchTuned(mshadow::Stream<cpu> *, const size_t N, Args... args) {
 #ifdef _OPENMP
     const int omp_threads = engine::OpenMP::Get()->GetRecommendedOMPThreadCount();
     if (omp_threads < 2 || !tuned_op<PRIMITIVE_OP, DType>::UseOMP(
-      static_cast<size_t>(N), static_cast<size_t>(omp_threads))) {
-      for (int i = 0; i < N; ++i) {
+      N, static_cast<size_t>(omp_threads))) {
+      for (size_t i = 0; i < N; ++i) {
         OP::Map(i, args...);
       }
     } else {
       #pragma omp parallel for num_threads(omp_threads)
-      for (int i = 0; i < N; ++i) {
+      for (index_t i = 0; i < static_cast<index_t>(N); ++i) {
         OP::Map(i, args...);
       }
     }
 #else
-    for (int i = 0; i < N; ++i) {
+    for (size_t i = 0; i < N; ++i) {
       OP::Map(i, args...);
     }
 #endif
@@ -596,15 +596,15 @@ struct Kernel<OP, cpu> {
    * \param args Varargs to eventually pass to the UseOMP() and OP::Map() functions
    */
   template<typename ...Args>
-  inline static void LaunchEx(mshadow::Stream<cpu> *s, const int N, Args... args) {
+  inline static void LaunchEx(mshadow::Stream<cpu> *s, const size_t N, Args... args) {
 #ifdef _OPENMP
     const int omp_threads = engine::OpenMP::Get()->GetRecommendedOMPThreadCount();
     if (omp_threads < 2) {
       OP::Map(0, N, args...);
     } else {
-      const int length = (N + omp_threads - 1) / omp_threads;
+      const auto length = (N + omp_threads - 1) / omp_threads;
       #pragma omp parallel for num_threads(omp_threads)
-      for (int i = 0; i < N; i += length) {
+      for (index_t i = 0; i < static_cast<index_t>(N); i += length) {
         OP::Map(i, i + length > N ? N - i : length, args...);
       }
     }
@@ -626,7 +626,7 @@ struct Kernel<OP, cpu> {
   template<typename DType, typename T = OP, typename ...Args>
   static MSHADOW_CINLINE
   typename std::enable_if<std::is_base_of<tunable, T>::value, bool>::type
-  Launch(mshadow::Stream<cpu> *s, const int N, DType *dest, Args... args) {
+  Launch(mshadow::Stream<cpu> *s, const size_t N, DType *dest, Args... args) {
     LaunchTuned<T, DType>(s, N, dest, args...);
     return true;
   }
@@ -644,7 +644,7 @@ struct Kernel<OP, cpu> {
   template<typename DType, typename T = OP, typename ...Args>
   static MSHADOW_CINLINE
   typename std::enable_if<std::is_base_of<tunable, typename T::Operation>::value, bool>::type
-  Launch(mshadow::Stream<cpu> *s, const int N, DType *dest, Args... args) {
+  Launch(mshadow::Stream<cpu> *s, const size_t N, DType *dest, Args... args) {
     LaunchTuned<typename T::Operation, DType>(s, N, dest, args...);
     return true;
   }
@@ -700,7 +700,7 @@ template<int val>
 struct set_to_int : public tunable {
   // mxnet_op version (when used directly with Kernel<>::Launch()) */
   template<typename DType>
-  MSHADOW_XINLINE static void Map(int i, DType *out) {
+  MSHADOW_XINLINE static void Map(index_t i, DType *out) {
     out[i] = DType(val);
   }
   // mshadow_op version (when used with op_with_req<>)
diff --git a/src/operator/random/sampler.h b/src/operator/random/sampler.h
index ca764e706c64..00963a6785ee 100644
--- a/src/operator/random/sampler.h
+++ b/src/operator/random/sampler.h
@@ -43,32 +43,33 @@ namespace op {
 template<typename OP, typename xpu, typename GType, typename ...Args>
 inline static void LaunchRNG(mshadow::Stream<xpu> *s,
                              common::random::RandGenerator<xpu, GType> *gen,
-                             const int N, Args... args) {
+                             const index_t N, Args... args) {
   // minimal check to avoid division by zero, below.
   // if `N` is zero the map operation is a no-op in any case.
   if (N <= 0) {
     return;
   }
-  const int nloop = (N + RandGenerator<xpu>::kMinNumRandomPerThread - 1) /
+  const index_t nloop = (N + RandGenerator<xpu>::kMinNumRandomPerThread - 1) /
                     RandGenerator<xpu>::kMinNumRandomPerThread;
-  const int nthread = std::min(nloop, RandGenerator<xpu>::kNumRandomStates);
-  const int step = (N + nthread - 1) / nthread;
+  const index_t nthread = std::min(nloop,
+                                   static_cast<index_t>(RandGenerator<xpu>::kNumRandomStates));
+  const index_t step = (N + nthread - 1) / nthread;
   Kernel<OP, xpu>::Launch(s, nthread, *gen, N, step, args...);
 }
 
 #define RNG_KERNEL_LOOP(xpu, GType, thread_id, gen, N, step, ...)        \
-  const int start = thread_id * step;                                    \
-  const int end = start + step;                                          \
+  const index_t start = thread_id * step;                                    \
+  const index_t end = start + step;                                          \
   typename RandGenerator<xpu, GType>::Impl genImpl(&gen, thread_id);     \
-  for (int i = start; i < end && i < N; ++i) {                           \
+  for (index_t i = start; i < end && i < N; ++i) {                           \
     {__VA_ARGS__}                                                        \
   }
 
 template<typename xpu>
 struct SampleUniformKernel {
   template<typename IType, typename OType>
-  MSHADOW_XINLINE static void Map(int id, RandGenerator<xpu, OType> gen,
-                                  const int N, const int step,
+  MSHADOW_XINLINE static void Map(index_t id, RandGenerator<xpu, OType> gen,
+                                  const index_t N, const index_t step,
                                   index_t nParm, index_t nSample,
                                   const IType *lower, const IType *upper, OType *out) {
     RNG_KERNEL_LOOP(xpu, OType, id, gen, N, step, {
@@ -127,8 +128,8 @@ struct RandIntSampler {
 template<typename xpu>
 struct SampleNormalKernel {
   template<typename IType, typename OType>
-  MSHADOW_XINLINE static void Map(int id, RandGenerator<xpu, OType> gen,
-                                  const int N, const int step,
+  MSHADOW_XINLINE static void Map(index_t id, RandGenerator<xpu, OType> gen,
+                                  const index_t N, const index_t step,
                                   index_t nParm, index_t nSample,
                                   const IType *mean, const IType *std, OType *out) {
     RNG_KERNEL_LOOP(xpu, OType, id, gen, N, step, {
@@ -154,8 +155,8 @@ struct NormalSampler {
 template<typename xpu>
 struct SampleExponentialKernel {
   template<typename IType, typename OType>
-  MSHADOW_XINLINE static void Map(int id, RandGenerator<xpu, OType> gen,
-                                  const int N, const int step,
+  MSHADOW_XINLINE static void Map(index_t id, RandGenerator<xpu, OType> gen,
+                                  const index_t N, const index_t step,
                                   index_t nParm, index_t nSample,
                                   const IType *lambda, OType *out) {
     RNG_KERNEL_LOOP(xpu, OType, id, gen, N, step, {
@@ -202,8 +203,8 @@ MSHADOW_XINLINE OType SampleGamma(IType a, IType b, typename RandGenerator<xpu,
 template<typename xpu>
 struct SampleGammaKernel {
   template<typename IType, typename OType, typename FType>
-  MSHADOW_XINLINE static void Map(int id, RandGenerator<xpu, FType> gen,
-                                  const int N, const int step,
+  MSHADOW_XINLINE static void Map(index_t id, RandGenerator<xpu, FType> gen,
+                                  const index_t N, const index_t step,
                                   index_t nParm, index_t nSample,
                                   const IType *alpha, const IType *beta, OType *out) {
     RNG_KERNEL_LOOP(xpu, FType, id, gen, N, step, {
@@ -264,8 +265,8 @@ MSHADOW_XINLINE int SamplePoisson(float lambda, typename RandGenerator<xpu, floa
 template<typename xpu>
 struct SamplePoissonKernel {
   template<typename IType, typename OType>
-  MSHADOW_XINLINE static void Map(int id, RandGenerator<xpu, float> gen,
-                                  const int N, const int step,
+  MSHADOW_XINLINE static void Map(index_t id, RandGenerator<xpu, float> gen,
+                                  const index_t N, const index_t step,
                                   index_t nParm, index_t nSample,
                                   const IType *lambda, OType *out) {
     RNG_KERNEL_LOOP(xpu, float, id, gen, N, step, {
@@ -291,8 +292,8 @@ struct PoissonSampler {
 template<typename xpu>
 struct SampleNegativeBinomialKernel {
   template<typename IType, typename OType>
-  MSHADOW_XINLINE static void Map(int id, RandGenerator<xpu, float> gen,
-                                  const int N, const int step,
+  MSHADOW_XINLINE static void Map(index_t id, RandGenerator<xpu, float> gen,
+                                  const index_t N, const index_t step,
                                   index_t nParm, index_t nSample,
                                   const IType *k, const IType *p, OType *out) {
     RNG_KERNEL_LOOP(xpu, float, id, gen, N, step, {
@@ -323,8 +324,8 @@ struct NegativeBinomialSampler {
 template<typename xpu>
 struct SampleGeneralizedNegativeBinomialKernel {
   template<typename IType, typename OType>
-  MSHADOW_XINLINE static void Map(int id, RandGenerator<xpu, float> gen,
-                                  const int N, const int step,
+  MSHADOW_XINLINE static void Map(index_t id, RandGenerator<xpu, float> gen,
+                                  const index_t N, const index_t step,
                                   index_t nParm, index_t nSample,
                                   const IType *mu, const IType *alpha, OType *out) {
     RNG_KERNEL_LOOP(xpu, float, id, gen, N, step, {
diff --git a/src/operator/tensor/broadcast_reduce-inl.h b/src/operator/tensor/broadcast_reduce-inl.h
index 167fa34b083f..141d2fb83d0d 100644
--- a/src/operator/tensor/broadcast_reduce-inl.h
+++ b/src/operator/tensor/broadcast_reduce-inl.h
@@ -53,14 +53,14 @@ MSHADOW_XINLINE Shape<ndim> calc_stride(const Shape<ndim>& shape) {
 }
 
 template<int ndim>
-MSHADOW_XINLINE void unravel_dot(const int idx, const Shape<ndim>& shape,
-  const Shape<ndim>& stridej, const Shape<ndim>& stridek, int* j, int* k) {
+MSHADOW_XINLINE void unravel_dot(const index_t idx, const Shape<ndim>& shape,
+  const Shape<ndim>& stridej, const Shape<ndim>& stridek, index_t* j, index_t* k) {
   *j = 0;
   *k = 0;
   #pragma unroll
-  for (int i = ndim-1, idx_t = idx; i >=0; --i) {
-    const int tmp = idx_t / shape[i];
-    const int coord = idx_t - tmp*shape[i];
+  for (index_t i = ndim-1, idx_t = idx; i >=0; --i) {
+    const auto tmp = idx_t / shape[i];
+    const auto coord = idx_t - tmp*shape[i];
     *j += coord*stridej[i];
     *k += coord*stridek[i];
     idx_t = tmp;
@@ -68,11 +68,11 @@ MSHADOW_XINLINE void unravel_dot(const int idx, const Shape<ndim>& shape,
 }
 
 template<int ndim>
-MSHADOW_XINLINE Shape<ndim> unravel(const int idx, const Shape<ndim>& shape) {
+MSHADOW_XINLINE Shape<ndim> unravel(const index_t idx, const Shape<ndim>& shape) {
   Shape<ndim> ret;
   #pragma unroll
-  for (int i = ndim-1, j = idx; i >=0; --i) {
-    int tmp = j / shape[i];
+  for (index_t i = ndim-1, j = idx; i >=0; --i) {
+    auto tmp = j / shape[i];
     ret[i] = j - tmp*shape[i];
     j = tmp;
   }
@@ -80,10 +80,10 @@ MSHADOW_XINLINE Shape<ndim> unravel(const int idx, const Shape<ndim>& shape) {
 }
 
 template<int ndim>
-MSHADOW_XINLINE int ravel(const Shape<ndim>& coord, const Shape<ndim>& shape) {
-  int ret = 0;
+MSHADOW_XINLINE index_t ravel(const Shape<ndim>& coord, const Shape<ndim>& shape) {
+  index_t ret = 0;
   #pragma unroll
-  for (int i = 0; i < ndim; ++i) {
+  for (index_t i = 0; i < ndim; ++i) {
     ret = ret * shape[i] + (shape[i] > 1) * coord[i];
   }
   return ret;
@@ -111,12 +111,12 @@ MSHADOW_XINLINE int diff(const Shape<ndim>& small, const Shape<ndim>& big, Shape
 }
 
 template<int ndim>
-MSHADOW_XINLINE int unravel_dot(const int idx, const Shape<ndim>& shape,
+MSHADOW_XINLINE index_t unravel_dot(const index_t idx, const Shape<ndim>& shape,
   const Shape<ndim>& stride) {
-  int ret = 0;
+  index_t ret = 0;
   #pragma unroll
-  for (int i = ndim-1, j = idx; i >=0; --i) {
-    int tmp = j / shape[i];
+  for (index_t i = ndim-1, j = idx; i >=0; --i) {
+    auto tmp = j / shape[i];
     ret += (j - tmp*shape[i])*stride[i];
     j = tmp;
   }
@@ -124,8 +124,8 @@ MSHADOW_XINLINE int unravel_dot(const int idx, const Shape<ndim>& shape,
 }
 
 template<int ndim>
-MSHADOW_XINLINE int dot(const Shape<ndim>& coord, const Shape<ndim>& stride) {
-  int ret = 0;
+MSHADOW_XINLINE index_t dot(const Shape<ndim>& coord, const Shape<ndim>& stride) {
+  index_t ret = 0;
   #pragma unroll
   for (int i = 0; i < ndim; ++i)
     ret += coord[i] * stride[i];
@@ -142,27 +142,27 @@ MSHADOW_XINLINE void assign(DType* dst, const bool addto, const DType src) {
 }
 
 template<int ndim, typename DType, typename OP>
-MSHADOW_XINLINE void binary_broadcast_assign(const int idx, const bool addto,
+MSHADOW_XINLINE void binary_broadcast_assign(const index_t idx, const bool addto,
                                              const DType* __restrict lhs,
                                              const DType* __restrict rhs, DType* out,
                                              const Shape<ndim>& lshape, const Shape<ndim>& rshape,
                                              const Shape<ndim>& oshape) {
   const Shape<ndim> coord = unravel(idx, oshape);
-  const int j = ravel(coord, lshape);
-  const int k = ravel(coord, rshape);
+  const index_t j = ravel(coord, lshape);
+  const index_t k = ravel(coord, rshape);
   assign(&out[idx], addto, OP::Map(lhs[j], rhs[k]));
 }
 
 template<typename Reducer, int ndim, typename DType, typename OP>
-MSHADOW_XINLINE void seq_reduce_assign(const int idx, const int M, const bool addto,
+MSHADOW_XINLINE void seq_reduce_assign(const index_t idx, const size_t M, const bool addto,
                                        const DType* __restrict big, DType *small,
                                        const Shape<ndim>& bshape, const Shape<ndim>& sshape,
                                        const Shape<ndim>& rshape, const Shape<ndim>& rstride) {
   Shape<ndim> coord = unravel(idx, sshape);
-  int j = ravel(coord, bshape);
+  index_t j = ravel(coord, bshape);
   DType val, residual;
   Reducer::SetInitValue(val, residual);
-  for (int k = 0; k < M; ++k) {
+  for (size_t k = 0; k < M; ++k) {
     coord = unravel(k, rshape);
     Reducer::Reduce(val, OP::Map(big[j + dot(coord, rstride)]), residual);
   }
@@ -176,10 +176,10 @@ MSHADOW_XINLINE void seq_reduce_assign(const int idx, const int M, const bool ad
 #else
 
 template<int ndim, typename DType, typename OP>
-void binary_broadcast_compute(const int N, const bool addto, const DType *lhs,
+void binary_broadcast_compute(const size_t N, const bool addto, const DType *lhs,
                               const DType *rhs, DType *out, const Shape<ndim> lshape,
                               const Shape<ndim> rshape, const Shape<ndim> oshape) {
-  for (int idx = 0; idx < N; ++idx) {
+  for (size_t idx = 0; idx < N; ++idx) {
     binary_broadcast_assign<ndim, DType, OP>(idx, addto, lhs, rhs, out, lshape, rshape, oshape);
   }
 }
@@ -188,26 +188,26 @@ template<int ndim, typename DType, typename OP>
 void BinaryBroadcastComputeImpl(Stream<cpu> *s, const OpReqType req,
                                 const TBlob& lhs, const TBlob& rhs, const TBlob& out) {
   if (req == kNullOp) return;
-  int N = out.shape_.Size();
+  size_t N = out.shape_.Size();
   binary_broadcast_compute<ndim, DType, OP>(N, req == kAddTo, lhs.dptr<DType>(), rhs.dptr<DType>(),
                            out.dptr<DType>(), lhs.shape_.get<ndim>(), rhs.shape_.get<ndim>(),
                            out.shape_.get<ndim>());
 }
 
 template<typename Reducer, int ndim, typename DType, typename OP>
-void seq_reduce_compute(const int N, const int M, const bool addto,
+void seq_reduce_compute(const size_t N, const size_t M, const bool addto,
                         const DType *big, DType *small, const Shape<ndim> bshape,
                         const Shape<ndim> sshape, const Shape<ndim> rshape,
                         const Shape<ndim> rstride) {
   #pragma omp parallel for num_threads(engine::OpenMP::Get()->GetRecommendedOMPThreadCount())
-  for (int idx = 0; idx < N; ++idx) {
+  for (index_t idx = 0; idx < static_cast<index_t>(N); ++idx) {
     seq_reduce_assign<Reducer, ndim, DType, OP>(idx, M, addto, big, small, bshape, sshape, rshape,
       rstride);
   }
 }
 
 template <typename Reducer, int ndim, typename DType, typename OP>
-void seq_reduce_compute_extra_mem(const int N, const int M, const bool addto,
+void seq_reduce_compute_extra_mem(const size_t N, const size_t M, const bool addto,
                                   const DType* big, DType* small,
                                   const Shape<ndim> bshape,
                                   const Shape<ndim> sshape,
@@ -215,12 +215,12 @@ void seq_reduce_compute_extra_mem(const int N, const int M, const bool addto,
                                   const Shape<ndim> rstride,
                                   const index_t* ws_dptr) {
   #pragma omp parallel for num_threads(engine::OpenMP::Get()->GetRecommendedOMPThreadCount())
-  for (int idx = 0; idx < N; ++idx) {
+  for (index_t idx = 0; idx < static_cast<index_t>(N); ++idx) {
     Shape<ndim> coord = unravel(idx, sshape);
-    int j = ravel(coord, bshape);
+    index_t j = ravel(coord, bshape);
     DType val, residual;
     Reducer::SetInitValue(val, residual);
-    for (int k = 0; k < M; ++k) {
+    for (size_t k = 0; k < M; ++k) {
       Reducer::Reduce(val, OP::Map(big[j + ws_dptr[k]]), residual);
     }
     assign(&small[idx], addto, val);
@@ -233,7 +233,7 @@ void Reduce(Stream<cpu>* s, const TBlob& small, const OpReqType req,
   if (req == kNullOp) return;
   Shape<ndim> rshape, rstride;
   diff(small.shape_.get<ndim>(), big.shape_.get<ndim>(), &rshape, &rstride);
-  int N = small.shape_.Size(), M = rshape.Size();
+  size_t N = small.shape_.Size(), M = rshape.Size();
   seq_reduce_compute<Reducer, ndim, DType, OP>(
     N, M, req == kAddTo, big.dptr<DType>(), small.dptr<DType>(),
     big.shape_.get<ndim>(), small.shape_.get<ndim>(), rshape, rstride);
@@ -247,9 +247,9 @@ void ReduceWithExtraMem(Stream<cpu>* s, const TBlob& small, const OpReqType req,
   Shape<ndim> rshape, rstride;
   diff(small.shape_.get<ndim>(), big.shape_.get<ndim>(), &rshape, &rstride);
   index_t* ws_dptr = reinterpret_cast<index_t*>(workspace.dptr_);
-  int N = small.shape_.Size(), M = rshape.Size();
+  size_t N = small.shape_.Size(), M = rshape.Size();
   #pragma omp parallel for num_threads(engine::OpenMP::Get()->GetRecommendedOMPThreadCount())
-  for (int k = 0; k < M; k++) {
+  for (index_t k = 0; k < static_cast<index_t>(M); k++) {
     Shape<ndim> coord = unravel(k, rshape);
     ws_dptr[k] = dot(coord, rstride);
   }
@@ -272,7 +272,7 @@ size_t ReduceWorkspaceSize(Stream<cpu> *s, const TShape& small, const OpReqType
 }
 
 template<typename Reducer, int ndim, typename DType, typename OP1, typename OP2>
-MSHADOW_XINLINE void seq_reduce_assign(const int idx, const int M, const bool addto,
+MSHADOW_XINLINE void seq_reduce_assign(const index_t idx, const size_t M, const bool addto,
                                        const DType* __restrict big, const DType* __restrict lhs,
                                        const DType* __restrict rhs, DType *small,
                                        const Shape<ndim>& big_shape, const Shape<ndim>& lhs_shape0,
@@ -282,20 +282,20 @@ MSHADOW_XINLINE void seq_reduce_assign(const int idx, const int M, const bool ad
                                        const Shape<ndim>& rstride, const Shape<ndim>& lhs_stride,
                                        const Shape<ndim>& rhs_stride) {
   Shape<ndim> coord = unravel(idx, small_shape);
-  const int idx_big0 = ravel(coord, big_shape);
-  const int idx_lhs0 = ravel(coord, lhs_shape0);
-  const int idx_rhs0 = ravel(coord, rhs_shape0);
+  const index_t idx_big0 = ravel(coord, big_shape);
+  const index_t idx_lhs0 = ravel(coord, lhs_shape0);
+  const index_t idx_rhs0 = ravel(coord, rhs_shape0);
   DType val, residual;
   Reducer::SetInitValue(val, residual);
-  for (int k = 0; k < M; ++k) {
+  for (size_t k = 0; k < M; ++k) {
     Shape<ndim> coord_big = unravel(k, rshape);
-    int idx_big = idx_big0 + dot(coord_big, rstride);
+    index_t idx_big = idx_big0 + dot(coord_big, rstride);
 
     Shape<ndim> coord_lhs = unravel(k, lhs_shape);
-    int idx_lhs = idx_lhs0 + dot(coord_lhs, lhs_stride);
+    index_t idx_lhs = idx_lhs0 + dot(coord_lhs, lhs_stride);
 
     Shape<ndim> coord_rhs = unravel(k, rhs_shape);
-    int idx_rhs = idx_rhs0 + dot(coord_rhs, rhs_stride);
+    index_t idx_rhs = idx_rhs0 + dot(coord_rhs, rhs_stride);
 
     Reducer::Reduce(val, OP1::Map(big[idx_big], OP2::Map(lhs[idx_lhs], rhs[idx_rhs])), residual);
   }
@@ -304,7 +304,7 @@ MSHADOW_XINLINE void seq_reduce_assign(const int idx, const int M, const bool ad
 }
 
 template<typename Reducer, int ndim, typename DType, typename OP1, typename OP2>
-void seq_reduce_compute(const int N, const int M, const bool addto,
+void seq_reduce_compute(const size_t N, const size_t M, const bool addto,
                         const DType *big, const DType *lhs, const DType *rhs, DType *small,
                         const Shape<ndim> big_shape, const Shape<ndim> small_shape,
                         const Shape<ndim> rshape, const Shape<ndim> rstride,
@@ -312,7 +312,7 @@ void seq_reduce_compute(const int N, const int M, const bool addto,
                         const Shape<ndim> rhs_shape, const Shape<ndim> rhs_stride,
                         const Shape<ndim>& lhs_shape0, const Shape<ndim>& rhs_shape0) {
   #pragma omp parallel for num_threads(engine::OpenMP::Get()->GetRecommendedOMPThreadCount())
-  for (int idx = 0; idx < N; ++idx) {
+  for (index_t idx = 0; idx < static_cast<index_t>(N); ++idx) {
     seq_reduce_assign<Reducer, ndim, DType, OP1, OP2>(idx, M, addto, big, lhs, rhs, small,
       big_shape, lhs_shape0, rhs_shape0, small_shape, rshape, lhs_shape, rhs_shape, rstride,
       lhs_stride, rhs_stride);
@@ -326,8 +326,8 @@ void Reduce(Stream<cpu> *s, const TBlob& small, const OpReqType req,
   if (req == kNullOp) return;
   Shape<ndim> rshape, rstride;
   diff(small.shape_.get<ndim>(), big.shape_.get<ndim>(), &rshape, &rstride);
-  int N = small.shape_.Size();
-  int M = rshape.Size();
+  size_t N = small.shape_.Size();
+  size_t M = rshape.Size();
 
   Shape<ndim> lhs_shape, lhs_stride;
   diff(small.shape_.get<ndim>(), lhs.shape_.get<ndim>(), &lhs_shape, &lhs_stride);
diff --git a/src/operator/tensor/elemwise_binary_broadcast_op.h b/src/operator/tensor/elemwise_binary_broadcast_op.h
index 391c35117128..304422038b89 100644
--- a/src/operator/tensor/elemwise_binary_broadcast_op.h
+++ b/src/operator/tensor/elemwise_binary_broadcast_op.h
@@ -190,7 +190,7 @@ namespace mxnet_op {
 template<int ndim, typename DType, typename OP>
 struct binary_broadcast_kernel {
   /*! \brief Map function for binary_broadcast_kernel */
-  MSHADOW_XINLINE static void Map(int base, int length, OpReqType req,
+  MSHADOW_XINLINE static void Map(index_t base, index_t length, OpReqType req,
                                   const Shape <ndim> &lstride, const Shape <ndim> &rstride,
                                   const Shape <ndim> &oshape, DType *lhs, DType *rhs,
                                   DType *out) {
@@ -199,7 +199,7 @@ struct binary_broadcast_kernel {
     auto ridx = static_cast<index_t>(dot(coord, rstride));
     KERNEL_ASSIGN(out[base], req, OP::Map(lhs[lidx], rhs[ridx]));
     // starts from 1 to avoid extra inc at end of loop
-    for (int i = 1; i < length; ++i) {
+    for (index_t i = 1; i < length; ++i) {
       inc(&coord, oshape, &lidx, lstride, &ridx, rstride);
       // When tuning, don't actually run the op, since it's not going to be tuned against
       // the actual op we'll eventually be using
@@ -208,7 +208,7 @@ struct binary_broadcast_kernel {
   }
 
   /*! \brief Map function for binary_broadcast_kernel */
-  MSHADOW_XINLINE static void Map(int base, int length, OpReqType req,
+  MSHADOW_XINLINE static void Map(index_t base, index_t length, OpReqType req,
                                   const Shape <ndim> &lstride, const Shape <ndim> &rstride,
                                   const Shape <ndim> &oshape, DType lhs, DType *rhs,
                                   DType *out) {
@@ -217,7 +217,7 @@ struct binary_broadcast_kernel {
     auto ridx = static_cast<index_t>(dot(coord, rstride));
     KERNEL_ASSIGN(out[base], req, OP::Map(lhs, rhs[ridx]));
     // starts from 1 to avoid extra inc at end of loop
-    for (int i = 1; i < length; ++i) {
+    for (index_t i = 1; i < length; ++i) {
       inc(&coord, oshape, &lidx, lstride, &ridx, rstride);
       // When tuning, don't actually run the op, since it's not going to be tuned against
       // the actual op we'll eventually be using
@@ -238,7 +238,7 @@ struct csr_dns_csr_broadcast_kernel {
    * \param out          ptr to the data buffer of the result csr matrix
    */
   template<typename DType, typename CType, typename RType>
-  MSHADOW_XINLINE static void Map(int row, const DType *csr_data, const CType *csr_indices,
+  MSHADOW_XINLINE static void Map(index_t row, const DType *csr_data, const CType *csr_indices,
                                   const RType *csr_indptr, const DType *dns, DType *out) {
     const nnvm::dim_t curr_row_i = csr_indptr[row];
     const nnvm::dim_t next_row_i = csr_indptr[row + 1];
@@ -257,7 +257,7 @@ struct csr_dns_csr_broadcast_kernel {
    * \param nnz         number of non-zero elements in input csr matrix
    */
   template<typename DType>
-  MSHADOW_XINLINE static void Map(int i, const DType *csr_data, const DType* scalar_ptr,
+  MSHADOW_XINLINE static void Map(index_t i, const DType *csr_data, const DType* scalar_ptr,
                                   DType *out, const nnvm::dim_t nnz) {
     const DType scale = scalar_ptr[0];
     if (i < nnz) {
@@ -269,7 +269,7 @@ struct csr_dns_csr_broadcast_kernel {
 template<int req, typename OP, bool reverse = false>
 struct csr_dns_map_kernel {
   template <typename DType, typename CType, typename RType>
-  MSHADOW_XINLINE static void Map(int row, const DType *csr_data, const CType *csr_indices,
+  MSHADOW_XINLINE static void Map(index_t row, const DType *csr_data, const CType *csr_indices,
                                   const RType *csr_indptr, DType *out, const nnvm::dim_t num_rows,
                                   const nnvm::dim_t num_cols) {
     if (row < num_rows) {
diff --git a/src/operator/tensor/indexing_op.cc b/src/operator/tensor/indexing_op.cc
index 77236e068f86..c39418dbe41d 100644
--- a/src/operator/tensor/indexing_op.cc
+++ b/src/operator/tensor/indexing_op.cc
@@ -36,7 +36,7 @@ struct TakeCPU {
   // K is the number of rows of in_data
   // i is the index of out_data
   template<typename DType, typename IType>
-  MSHADOW_XINLINE static void Map(int i, DType* out_data, const DType* in_data,
+  MSHADOW_XINLINE static void Map(index_t i, DType* out_data, const DType* in_data,
                                   const IType* idx, const size_t M, const int64_t K) {
     int64_t j = static_cast<int64_t>(idx[i]);
     if (clip) {
@@ -420,19 +420,19 @@ inline void SparseEmbeddingOpBackwardRspImpl<cpu>(const bool deterministic,
 
 template<typename DType, typename IType>
 inline typename std::enable_if<(!std::is_same<DType, mshadow::half::half_t>::value), void>::type
-GatherNDBackwardImpl(int N, int M, int K,
+GatherNDBackwardImpl(index_t N, index_t M, index_t K,
                      const mshadow::Shape<10> strides,
                      DType* out,
                      const DType* data,
                      const IType* indices,
                      mshadow::Stream<cpu> *s) {
 #pragma omp parallel for
-  for (int i = 0; i < N; i++) {
-    int offset = 0;
-    for (int j = 0; j < M; ++j) {
-      offset += strides[j] * static_cast<int>(indices[j*N + i]);
+  for (index_t i = 0; i < N; i++) {
+    index_t offset = 0;
+    for (index_t j = 0; j < M; ++j) {
+      offset += strides[j] * static_cast<index_t>(indices[j*N + i]);
     }
-    for (int j = 0; j < K; ++j) {
+    for (index_t j = 0; j < K; ++j) {
 #pragma omp atomic
       out[offset + j] += data[i * K + j];
     }
@@ -441,18 +441,18 @@ GatherNDBackwardImpl(int N, int M, int K,
 
 template<typename DType, typename IType>
 inline typename std::enable_if<std::is_same<DType, mshadow::half::half_t>::value, void>::type
-GatherNDBackwardImpl(int N, int M, int K,
+GatherNDBackwardImpl(index_t N, index_t M, index_t K,
                      const mshadow::Shape<10> strides,
                      DType* out,
                      const DType* data,
                      const IType* indices,
                      mshadow::Stream<cpu> *s) {
-  for (int i = 0; i < N; i++) {
-    int offset = 0;
-    for (int j = 0; j < M; ++j) {
-      offset += strides[j] * static_cast<int>(indices[j*N + i]);
+  for (index_t i = 0; i < N; i++) {
+    index_t offset = 0;
+    for (index_t j = 0; j < M; ++j) {
+      offset += strides[j] * static_cast<index_t>(indices[j*N + i]);
     }
-    for (int j = 0; j < K; ++j) {
+    for (index_t j = 0; j < K; ++j) {
       out[offset + j] += data[i * K + j];
     }
   }
diff --git a/src/operator/tensor/indexing_op.cu b/src/operator/tensor/indexing_op.cu
index 0d72b1815fde..bad3e5a1a6c5 100644
--- a/src/operator/tensor/indexing_op.cu
+++ b/src/operator/tensor/indexing_op.cu
@@ -439,22 +439,27 @@ inline void SparseEmbeddingOpBackwardRspImpl<gpu>(const bool deterministic,
 
+/*!
+ * \brief Kernel for index i: accumulates data[i*K + j], j in [0, K), into out[offset + j],
+ *        where offset = sum over j in [0, M) of strides[j] * indices[j*N + i].
+ */
 struct backward_gather_nd_gpu {
   template<typename DType, typename IType>
-  MSHADOW_XINLINE static void Map(int i, int N, int M, int K,
+  MSHADOW_XINLINE static void Map(index_t i, index_t N, index_t M, index_t K,
                                   const mshadow::Shape<10> strides,
                                   DType* out, const DType* data,
                                   const IType* indices) {
-    int offset = 0;
-    for (int j = 0; j < M; ++j) {
-      offset += strides[j] * static_cast<int>(indices[j*N + i]);
+    index_t offset = 0;
+    for (index_t j = 0; j < M; ++j) {
+      // Cast to index_t rather than int so large index values are not truncated.
+      offset += strides[j] * static_cast<index_t>(indices[j*N + i]);
     }
-    for (int j = 0; j < K; ++j) {
+    for (index_t j = 0; j < K; ++j) {
       atomicAdd(out + (offset + j), data[i * K + j]);
     }
   }
 };
 
 template<typename DType, typename IType>
-inline void GatherNDBackwardImpl(int N, int M, int K,
+inline void GatherNDBackwardImpl(index_t N, index_t M, index_t K,
                                  const mshadow::Shape<10> strides,
                                  DType* out,
                                  const DType* data,
diff --git a/src/operator/tensor/indexing_op.h b/src/operator/tensor/indexing_op.h
index 92b6e21018e5..fba331e25705 100644
--- a/src/operator/tensor/indexing_op.h
+++ b/src/operator/tensor/indexing_op.h
@@ -314,7 +314,8 @@ struct Take {
    * \param axis        axis id
    */
   template<typename DType, typename IType>
-  MSHADOW_XINLINE static void Map(int i, DType* out_data, const DType* in_data, const IType* idx,
+  MSHADOW_XINLINE static void Map(index_t i, DType* out_data, const DType* in_data,
+                                  const IType* idx,
                                   const mshadow::Shape<10> in_stride,
                                   const mshadow::Shape<10> out_stride,
                                   const int in_ndims, const int out_ndims, const int idx_ndims,
@@ -361,7 +362,7 @@ struct TakeRspKernel {
    * \param nnr         number of non-zero rows
    */
   template<typename DType, typename IType, typename RType>
-  MSHADOW_XINLINE static void Map(int i,
+  MSHADOW_XINLINE static void Map(index_t i,
                                   const IType* data,
                                   DType* out,
                                   const RType* weight_idx,
@@ -1395,15 +1396,15 @@ inline bool ScatterNDType(const nnvm::NodeAttrs& attrs,
 
 struct scatter_nd {
   template<typename DType, typename IType>
-  MSHADOW_XINLINE static void Map(int i, OpReqType req, int N, int M, int K,
+  MSHADOW_XINLINE static void Map(index_t i, OpReqType req, index_t N, index_t M, index_t K,
                                   const mshadow::Shape<10> strides,
                                   DType* out, const DType* data,
                                   const IType* indices) {
-    int offset = 0;
-    for (int j = 0; j < M; ++j) {
-      offset += strides[j] * static_cast<int>(indices[j*N + i]);
+    index_t offset = 0;
+    for (index_t j = 0; j < M; ++j) {
+      offset += strides[j] * static_cast<index_t>(indices[j*N + i]);
     }
-    for (int j = 0; j < K; ++j) {
+    for (index_t j = 0; j < K; ++j) {
       KERNEL_ASSIGN(out[offset+j], req, data[i*K + j]);
     }
   }
@@ -1416,17 +1417,18 @@ void ScatterNDForward(const nnvm::NodeAttrs& attrs,
                       const std::vector<OpReqType>& req,
                       const std::vector<TBlob>& outputs) {
   using namespace mshadow;
+  using nnvm::dim_t;
   CHECK_EQ(inputs.size(), 2U);
   CHECK_EQ(outputs.size(), 1U);
   if (req[0] == kNullOp) return;
   mshadow::Stream<xpu> *s = ctx.get_stream<xpu>();
   const TShape& oshape = outputs[0].shape_;
   const TShape& ishape = inputs[1].shape_;
-  int M = ishape[0];
-  int N = ishape.Size() / M;
-  int K = oshape.ProdShape(M, oshape.ndim());
+  dim_t M = ishape[0];
+  dim_t N = ishape.Size() / M;
+  dim_t K = oshape.ProdShape(M, oshape.ndim());
   mshadow::Shape<10> strides;
-  for (int i = M-1, stride = K; i >= 0; stride *= oshape[i], --i) strides[i] = stride;
+  for (dim_t i = M-1, stride = K; i >= 0; stride *= oshape[i], --i) strides[i] = stride;
   if (kWriteTo == req[0]) {
     Fill<true>(s, outputs[0], req[0], 0);
   }
@@ -1441,7 +1443,7 @@ void ScatterNDForward(const nnvm::NodeAttrs& attrs,
 
 template<typename DType, typename IType>
 inline typename std::enable_if<(!std::is_same<DType, mshadow::half::half_t>::value), void>::type
-GatherNDBackwardImpl(int N, int M, int K,
+GatherNDBackwardImpl(index_t N, index_t M, index_t K,
                      const mshadow::Shape<10> strides,
                      DType* out,
                      const DType* data,
@@ -1450,7 +1452,7 @@ GatherNDBackwardImpl(int N, int M, int K,
 
 template<typename DType, typename IType>
 inline typename std::enable_if<std::is_same<DType, mshadow::half::half_t>::value, void>::type
-GatherNDBackwardImpl(int N, int M, int K,
+GatherNDBackwardImpl(index_t N, index_t M, index_t K,
                      const mshadow::Shape<10> strides,
                      DType* out,
                      const DType* data,
@@ -1458,7 +1460,7 @@ GatherNDBackwardImpl(int N, int M, int K,
                      mshadow::Stream<cpu> *s);
 
 template<typename DType, typename IType>
-inline void GatherNDBackwardImpl(int N, int M, int K,
+inline void GatherNDBackwardImpl(index_t N, index_t M, index_t K,
                                  const mshadow::Shape<10> strides,
                                  DType* out,
                                  const DType* data,
@@ -1472,17 +1474,18 @@ void GatherNDBackward(const nnvm::NodeAttrs& attrs,
                       const std::vector<OpReqType>& req,
                       const std::vector<TBlob>& outputs) {
   using namespace mshadow;
+  using nnvm::dim_t;
   CHECK_EQ(inputs.size(), 2U);
   CHECK_EQ(outputs.size(), 1U);
   if (req[0] == kNullOp) return;
   mshadow::Stream<xpu> *s = ctx.get_stream<xpu>();
   const TShape& oshape = outputs[0].shape_;
   const TShape& ishape = inputs[1].shape_;
-  int M = ishape[0];
-  int N = ishape.Size() / M;
-  int K = oshape.ProdShape(M, oshape.ndim());
+  dim_t M = ishape[0];
+  dim_t N = ishape.Size() / M;
+  dim_t K = oshape.ProdShape(M, oshape.ndim());
   mshadow::Shape<10> strides;
-  for (int i = M-1, stride = K; i >= 0; stride *= oshape[i], --i) strides[i] = stride;
+  for (dim_t i = M-1, stride = K; i >= 0; stride *= oshape[i], --i) strides[i] = stride;
   if (kWriteTo == req[0]) {
     Fill<true>(s, outputs[0], req[0], 0);
   }
diff --git a/src/operator/tensor/init_op.h b/src/operator/tensor/init_op.h
index 4e52b087f10a..e9e67cb1a4c5 100644
--- a/src/operator/tensor/init_op.h
+++ b/src/operator/tensor/init_op.h
@@ -453,7 +453,7 @@ void EyeFill(const nnvm::NodeAttrs& attrs,
 
 struct range_fwd {
   template<typename DType>
-  MSHADOW_XINLINE static void Map(int i, int repeat, DType start, DType step,
+  MSHADOW_XINLINE static void Map(index_t i, int repeat, DType start, DType step,
                                   int req, DType* out) {
     KERNEL_ASSIGN(out[i], req, start + (i/repeat) * step);
   }
@@ -471,8 +471,8 @@ void RangeCompute(const nnvm::NodeAttrs& attrs,
   MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, {
       // Force unsigned params to take two's complement form on ARM to ensure consistency with x86
       // results.  Casting negative floats to unsigned types is undefined in the CPP standard.
-      auto step = std::is_signed<DType>() ? param.step : static_cast<int>(param.step);
-      auto start = std::is_signed<DType>() ? param.start : static_cast<int>(param.start);
+      auto step = std::is_signed<DType>() ? param.step : static_cast<index_t>(param.step);
+      auto start = std::is_signed<DType>() ? param.start : static_cast<index_t>(param.start);
       Kernel<range_fwd, xpu>::Launch(s,
                                      outputs[0].Size(),
                                      static_cast<int>(param.repeat),
diff --git a/src/operator/tensor/matrix_op-inl.h b/src/operator/tensor/matrix_op-inl.h
index 9c81d87464de..3b229cf38eba 100644
--- a/src/operator/tensor/matrix_op-inl.h
+++ b/src/operator/tensor/matrix_op-inl.h
@@ -626,9 +626,9 @@ inline void GetIndexRange(const TShape& dshape,
                           const nnvm::Tuple<dmlc::optional<int>>& param_begin,
                           const nnvm::Tuple<dmlc::optional<int>>& param_end,
                           const nnvm::Tuple<dmlc::optional<int>>& param_step,
-                          common::StaticArray<int, ndim>* begin,
-                          common::StaticArray<int, ndim>* end,
-                          common::StaticArray<int, ndim>* step) {
+                          common::StaticArray<index_t, ndim>* begin,
+                          common::StaticArray<index_t, ndim>* end,
+                          common::StaticArray<index_t, ndim>* step) {
   CHECK_NE(dshape.ndim(), 0U);
   CHECK_LE(param_begin.ndim(), dshape.ndim())
     << "Slicing axis exceeds data dimensions";
@@ -646,8 +646,8 @@ inline void GetIndexRange(const TShape& dshape,
   }
 
   for (index_t i = 0; i < param_begin.ndim(); ++i) {
-    int b = 0, e = dshape[i], s = 1;
-    const int len = dshape[i];
+    index_t b = 0, e = dshape[i], s = 1;
+    const index_t len = dshape[i];
     if (param_step.ndim() != 0U) {
       const auto& opt_step_val = param_step[i];
       if (opt_step_val.has_value()) {
@@ -724,7 +724,7 @@ inline bool SliceOpShape(const nnvm::NodeAttrs& attrs,
   TShape oshape = dshape;
 
   MXNET_NDIM_SWITCH(dshape.ndim(), ndim, {
-    common::StaticArray<int, ndim> begin, end, step;
+    common::StaticArray<index_t, ndim> begin, end, step;
     GetIndexRange(dshape, param.begin, param.end, param.step, &begin, &end, &step);
     for (index_t i = 0; i < param.begin.ndim(); ++i) {
       const int b = begin[i], e = end[i], s = step[i];
@@ -743,19 +743,19 @@ template<int ndim, int req>
 struct slice_forward<ndim, req, gpu> {
   // i is the i-th row after flattening out into 2D tensor
   template<typename DType>
-  MSHADOW_XINLINE static void Map(int i, DType* out, const DType* data,
+  MSHADOW_XINLINE static void Map(index_t i, DType* out, const DType* data,
                                   const mshadow::Shape<ndim> dshape,
                                   const mshadow::Shape<ndim> oshape,
-                                  const common::StaticArray<int, ndim> begin,
-                                  const common::StaticArray<int, ndim> step) {
-    const int data_last_dim_size = dshape[ndim-1];
-    const int out_last_dim_size = oshape[ndim-1];
-    const int step_last_dim = step[ndim-1];
-    const int begin_last_dim = begin[ndim-1];
-    const int j = i % out_last_dim_size;
-    int irow = 0;  // row id of flattend 2D data
-    int stride = 1;
-    int idx = i / out_last_dim_size;
+                                  const common::StaticArray<index_t, ndim> begin,
+                                  const common::StaticArray<index_t, ndim> step) {
+    const index_t data_last_dim_size = dshape[ndim-1];
+    const index_t out_last_dim_size = oshape[ndim-1];
+    const index_t step_last_dim = step[ndim-1];
+    const index_t begin_last_dim = begin[ndim-1];
+    const index_t j = i % out_last_dim_size;
+    index_t irow = 0;  // row id of flattend 2D data
+    index_t stride = 1;
+    index_t idx = i / out_last_dim_size;
     #pragma unroll
     for (int k = ndim - 2; k >= 0; --k) {
       irow += stride * ((idx % oshape[k]) * step[k] + begin[k]);
@@ -771,20 +771,20 @@ template<int ndim, int req>
 struct slice_forward<ndim, req, cpu> {
   // i is the i-th row after flattening out into 2D tensor
   template<typename DType>
-  MSHADOW_XINLINE static void Map(int i, DType* out, const DType* data,
+  MSHADOW_XINLINE static void Map(index_t i, DType* out, const DType* data,
                                   const mshadow::Shape<ndim> dshape,
                                   const mshadow::Shape<ndim> oshape,
-                                  const common::StaticArray<int, ndim> begin,
-                                  const common::StaticArray<int, ndim> step) {
-    const int data_last_dim_size = dshape[ndim-1];
-    const int out_last_dim_size = oshape[ndim-1];
-    const int step_last_dim = step[ndim-1];
-    const int begin_last_dim = begin[ndim-1];
-    int out_offset = i * out_last_dim_size;
-    for (int j = 0; j < out_last_dim_size; ++j) {
-      int irow = 0;  // row id of flattend 2D data
-      int stride = 1;
-      int idx = i;
+                                  const common::StaticArray<index_t, ndim> begin,
+                                  const common::StaticArray<index_t, ndim> step) {
+    const index_t data_last_dim_size = dshape[ndim-1];
+    const index_t out_last_dim_size = oshape[ndim-1];
+    const index_t step_last_dim = step[ndim-1];
+    const index_t begin_last_dim = begin[ndim-1];
+    index_t out_offset = i * out_last_dim_size;
+    for (index_t j = 0; j < out_last_dim_size; ++j) {
+      index_t irow = 0;  // row id of flattend 2D data
+      index_t stride = 1;
+      index_t idx = i;
       #pragma unroll
       for (int k = ndim - 2; k >= 0; --k) {
         irow += stride * ((idx % oshape[k]) * step[k] + begin[k]);
@@ -813,11 +813,11 @@ void SliceOpForward(const nnvm::NodeAttrs& attrs,
   const TBlob& out = outputs[0];
   const SliceParam& param = nnvm::get<SliceParam>(attrs.parsed);
   MXNET_NDIM_SWITCH(data.ndim(), ndim, {
-    common::StaticArray<int, ndim> begin, end, step;
+    common::StaticArray<index_t, ndim> begin, end, step;
     GetIndexRange(data.shape_, param.begin, param.end, param.step, &begin, &end, &step);
     MSHADOW_TYPE_SWITCH(out.type_flag_, DType, {
       MXNET_ASSIGN_REQ_SWITCH(req[0], Req, {
-        int num_threads = out.shape_.FlatTo2D()[0];
+        size_t num_threads = out.shape_.FlatTo2D()[0];
         if (std::is_same<xpu, gpu>::value) {
           num_threads *= out.shape_.get<ndim>()[ndim - 1];
         }
@@ -836,20 +836,20 @@ template<int ndim, int req>
 struct slice_assign<ndim, req, cpu> {
   // i is the i-th row after flattening out into 2D tensor
   template<typename DType>
-  MSHADOW_XINLINE static void Map(int i, DType* out, const DType* val,
+  MSHADOW_XINLINE static void Map(index_t i, DType* out, const DType* val,
                                   const mshadow::Shape<ndim> oshape,
                                   const mshadow::Shape<ndim> vshape,
-                                  const common::StaticArray<int, ndim> begin,
-                                  const common::StaticArray<int, ndim> step) {
-    const int data_last_dim_size = oshape[ndim-1];
-    const int out_last_dim_size = vshape[ndim-1];
-    const int step_last_dim = step[ndim-1];
-    const int begin_last_dim = begin[ndim-1];
-    int offset = i * out_last_dim_size;
-    for (int j = 0; j < out_last_dim_size; ++j) {
-      int irow = 0;  // row id of flattend 2D out
-      int stride = 1;
-      int idx = i;
+                                  const common::StaticArray<index_t, ndim> begin,
+                                  const common::StaticArray<index_t, ndim> step) {
+    const index_t data_last_dim_size = oshape[ndim-1];
+    const index_t out_last_dim_size = vshape[ndim-1];
+    const index_t step_last_dim = step[ndim-1];
+    const index_t begin_last_dim = begin[ndim-1];
+    index_t offset = i * out_last_dim_size;
+    for (index_t j = 0; j < out_last_dim_size; ++j) {
+      index_t irow = 0;  // row id of flattend 2D out
+      index_t stride = 1;
+      index_t idx = i;
       #pragma unroll
       for (int k = ndim - 2; k >= 0; --k) {
         irow += stride * ((idx % vshape[k]) * step[k] + begin[k]);
@@ -866,19 +866,19 @@ template<int ndim, int req>
 struct slice_assign<ndim, req, gpu> {
   // i is the i-th row after flattening out into 2D tensor
   template<typename DType>
-  MSHADOW_XINLINE static void Map(int i, DType* out, const DType* val,
+  MSHADOW_XINLINE static void Map(index_t i, DType* out, const DType* val,
                                   const mshadow::Shape<ndim> oshape,
                                   const mshadow::Shape<ndim> vshape,
-                                  const common::StaticArray<int, ndim> begin,
-                                  const common::StaticArray<int, ndim> step) {
-    const int data_last_dim_size = oshape[ndim-1];
-    const int out_last_dim_size = vshape[ndim-1];
-    const int step_last_dim = step[ndim-1];
-    const int begin_last_dim = begin[ndim-1];
-    const int j = i % out_last_dim_size;
-    int irow = 0;  // row id of flattend 2D out
-    int stride = 1;
-    int idx = i / out_last_dim_size;
+                                  const common::StaticArray<index_t, ndim> begin,
+                                  const common::StaticArray<index_t, ndim> step) {
+    const index_t data_last_dim_size = oshape[ndim-1];
+    const index_t out_last_dim_size = vshape[ndim-1];
+    const index_t step_last_dim = step[ndim-1];
+    const index_t begin_last_dim = begin[ndim-1];
+    const index_t j = i % out_last_dim_size;
+    index_t irow = 0;  // row id of flattend 2D out
+    index_t stride = 1;
+    index_t idx = i / out_last_dim_size;
     #pragma unroll
     for (int k = ndim - 2; k >= 0; --k) {
       irow += stride * ((idx % vshape[k]) * step[k] + begin[k]);
@@ -911,7 +911,7 @@ void SliceOpBackward(const nnvm::NodeAttrs& attrs,
     LOG(FATAL) << "_slice_backward does not support kWriteInplace";
   }
   MXNET_NDIM_SWITCH(ograd.ndim(), ndim, {
-    common::StaticArray<int, ndim> begin, end, step;
+    common::StaticArray<index_t, ndim> begin, end, step;
     GetIndexRange(igrad.shape_, param.begin, param.end, param.step, &begin, &end, &step);
     MSHADOW_TYPE_SWITCH(ograd.type_flag_, DType, {
       MXNET_ASSIGN_REQ_SWITCH(req[0], Req, {
@@ -937,7 +937,7 @@ inline bool SliceAssignOpShape(const nnvm::NodeAttrs& attrs,
   TShape vshape = dshape;  // vshape is the value shape on the right hand side
   const SliceParam& param = nnvm::get<SliceParam>(attrs.parsed);
   MXNET_NDIM_SWITCH(dshape.ndim(), ndim, {
-    common::StaticArray<int, ndim> begin, end, step;
+    common::StaticArray<index_t, ndim> begin, end, step;
     GetIndexRange(dshape, param.begin, param.end, param.step, &begin, &end, &step);
     for (index_t i = 0; i < param.begin.ndim(); ++i) {
       const int b = begin[i], e = end[i], s = step[i];
@@ -975,7 +975,7 @@ void SliceAssignOpForward(const nnvm::NodeAttrs& attrs,
 
   const SliceParam& param = nnvm::get<SliceParam>(attrs.parsed);
   MXNET_NDIM_SWITCH(data.ndim(), ndim, {
-    common::StaticArray<int, ndim> begin, end, step;
+    common::StaticArray<index_t, ndim> begin, end, step;
     GetIndexRange(data.shape_, param.begin, param.end, param.step, &begin, &end, &step);
     MSHADOW_TYPE_SWITCH(out.type_flag_, DType, {
       MXNET_ASSIGN_REQ_SWITCH(req[0], Req, {
@@ -1024,20 +1024,20 @@ template<int ndim>
 struct slice_assign_scalar {
   // i is the i-th row after flattening out into 2D tensor
   template<typename DType>
-  MSHADOW_XINLINE static void Map(int i, DType* out, const DType val,
+  MSHADOW_XINLINE static void Map(index_t i, DType* out, const DType val,
                                   const OpReqType req,
                                   const mshadow::Shape<ndim> oshape,
                                   const mshadow::Shape<ndim> vshape,
-                                  const common::StaticArray<int, ndim> begin,
-                                  const common::StaticArray<int, ndim> step) {
-    const int data_last_dim_size = oshape[ndim-1];
-    const int out_last_dim_size = vshape[ndim-1];
-    const int step_last_dim = step[ndim-1];
-    const int begin_last_dim = begin[ndim-1];
-    for (int j = 0; j < out_last_dim_size; ++j) {
-      int irow = 0;  // row id of flattend 2D out
-      int stride = 1;
-      int idx = i;
+                                  const common::StaticArray<index_t, ndim> begin,
+                                  const common::StaticArray<index_t, ndim> step) {
+    const index_t data_last_dim_size = oshape[ndim-1];
+    const index_t out_last_dim_size = vshape[ndim-1];
+    const index_t step_last_dim = step[ndim-1];
+    const index_t begin_last_dim = begin[ndim-1];
+    for (index_t j = 0; j < out_last_dim_size; ++j) {
+      index_t irow = 0;  // row id of flattend 2D out
+      index_t stride = 1;
+      index_t idx = i;
       #pragma unroll
       for (int k = ndim - 2; k >= 0; --k) {
         irow += stride * ((idx % vshape[k]) * step[k] + begin[k]);
@@ -1076,7 +1076,7 @@ void SliceAssignScalarOpForward(const nnvm::NodeAttrs& attrs,
   TShape vshape = data.shape_;
   const SliceAssignScalarParam& param = nnvm::get<SliceAssignScalarParam>(attrs.parsed);
   MXNET_NDIM_SWITCH(data.ndim(), ndim, {
-    common::StaticArray<int, ndim> begin, end, step;
+    common::StaticArray<index_t, ndim> begin, end, step;
     GetIndexRange(data.shape_, param.begin, param.end, param.step, &begin, &end, &step);
     for (index_t i = 0; i < param.begin.ndim(); ++i) {
       const int b = begin[i], e = end[i], s = step[i];
@@ -1107,7 +1107,7 @@ struct SliceAxisParam : public dmlc::Parameter<SliceAxisParam> {
 };
 
 inline void GetSliceAxisParams(const SliceAxisParam& param, const TShape& ishape,
-                           int* axis, int* begin, int* end) {
+                           int* axis, index_t* begin, index_t* end) {
   *axis = param.axis;
   if (*axis < 0) {
     *axis += static_cast<int>(ishape.ndim());
@@ -1115,7 +1115,7 @@ inline void GetSliceAxisParams(const SliceAxisParam& param, const TShape& ishape
   CHECK(*axis < static_cast<int>(ishape.ndim()) && *axis >= 0) <<
     "Transformed axis must be smaller than the source ndim and larger than zero! Recieved axis=" <<
     param.axis << ", src_ndim=" << ishape.ndim() << ", transformed axis=" << *axis;
-  int axis_size = static_cast<int>(ishape[*axis]);
+  index_t axis_size = static_cast<index_t>(ishape[*axis]);
   *begin = param.begin;
   *end = -1;
   if (*begin < 0) {
@@ -1149,7 +1149,8 @@ inline bool SliceAxisShape(const nnvm::NodeAttrs& attrs,
   CHECK_EQ(in_attrs->size(), 1U);
   CHECK_EQ(out_attrs->size(), 1U);
   TShape& ishape = (*in_attrs)[0];
-  int axis, begin, end;
+  int axis;
+  index_t begin, end;
   GetSliceAxisParams(param, ishape, &axis, &begin, &end);
   TShape shape(ishape.ndim());
   for (index_t i = 0; i < ishape.ndim(); ++i) {
@@ -1173,7 +1174,8 @@ void SliceAxis(const nnvm::NodeAttrs& attrs,
   using namespace mshadow::expr;
   const SliceAxisParam& param = nnvm::get<SliceAxisParam>(attrs.parsed);
   mshadow::Stream<xpu> *s = ctx.get_stream<xpu>();
-  int axis, begin, end;
+  int axis;
+  index_t begin, end;
   GetSliceAxisParams(param, inputs[0].shape_, &axis, &begin, &end);
   int ndim = static_cast<int>(outputs[0].ndim());
 
@@ -1207,7 +1209,8 @@ void SliceAxisGrad_(const nnvm::NodeAttrs& attrs,
   using namespace mshadow::op;
   using namespace mshadow::expr;
   mshadow::Stream<xpu> *s = ctx.get_stream<xpu>();
-  int axis, begin, end;
+  int axis;
+  index_t begin, end;
   GetSliceAxisParams(param, outputs[0].shape_, &axis, &begin, &end);
   int ndim = static_cast<int>(outputs[0].shape_.ndim());
 
@@ -1354,7 +1357,7 @@ void SliceLikeForward(const nnvm::NodeAttrs& attrs,
   SliceLikeInferRanges(ishape, from_shape, param.axes, &param_begin, &param_end, &param_step);
 
   MXNET_NDIM_SWITCH(data.ndim(), ndim, {
-    common::StaticArray<int, ndim> begin, end, step;
+    common::StaticArray<index_t, ndim> begin, end, step;
     GetIndexRange(data.shape_, param_begin, param_end, param_step, &begin, &end, &step);
     MSHADOW_TYPE_SWITCH(out.type_flag_, DType, {
       MXNET_ASSIGN_REQ_SWITCH(req[0], Req, {
@@ -1400,7 +1403,7 @@ void SliceLikeBackward(const nnvm::NodeAttrs& attrs,
   SliceLikeInferRanges(ishape, from_shape, param.axes, &param_begin, &param_end, &param_step);
 
   MXNET_NDIM_SWITCH(ograd.ndim(), ndim, {
-    common::StaticArray<int, ndim> begin, end, step;
+    common::StaticArray<index_t, ndim> begin, end, step;
     GetIndexRange(ograd.shape_, param_begin, param_end, param_step, &begin, &end, &step);
     MSHADOW_TYPE_SWITCH(ograd.type_flag_, DType, {
       MXNET_ASSIGN_REQ_SWITCH(req[0], Req, {
@@ -1429,7 +1432,7 @@ struct ClipParam : public dmlc::Parameter<ClipParam> {
 
 struct clip {
   template<typename DType>
-  MSHADOW_XINLINE static void Map(int i, DType* out, const DType* datas,
+  MSHADOW_XINLINE static void Map(index_t i, DType* out, const DType* datas,
                                   DType a_min, DType a_max) {
     DType data = datas[i];
     if (data > a_max) {
@@ -1445,7 +1448,7 @@ struct clip {
 
 struct clip_grad {
   template<typename DType>
-  MSHADOW_XINLINE static void Map(int i, DType* out, const DType* grad, const DType* datas,
+  MSHADOW_XINLINE static void Map(index_t i, DType* out, const DType* grad, const DType* datas,
                                   DType a_min, DType a_max) {
     DType data = datas[i];
     if (data > a_max) {
@@ -1934,7 +1937,7 @@ struct reverse {
   }
 #ifdef __CUDACC__
   template<typename DType>
-  __device__  static void Map(int index, index_t nreversedim, const DType *src, DType *dst,
+  __device__  static void Map(index_t index, index_t nreversedim, const DType *src, DType *dst,
                               const index_t * stride_,
                               const index_t * trailing_) {
     __shared__ index_t stride_share[REVERSE_MAX_DIM];
@@ -1949,7 +1952,7 @@ struct reverse {
   }
 #else
   template<typename DType>
-  MSHADOW_XINLINE  static void Map(int index, index_t nreversedim, const DType *src, DType *dst,
+  MSHADOW_XINLINE  static void Map(index_t index, index_t nreversedim, const DType *src, DType *dst,
                                    const index_t * stride_,
                                    const index_t * trailing_) {
     index_t new_idx = ReverseIndex(index, nreversedim, stride_, trailing_);
@@ -2141,10 +2144,10 @@ struct SqueezeParam : public dmlc::Parameter<SqueezeParam> {
 // move all the zeros to the last of the shape array
 // and keep the relative order of the non-zero values.
 // Returns the new shape size after moving all zeros to the end.
-inline uint32_t SqueezeShapeHelper(TShape* shape) {
+inline size_t SqueezeShapeHelper(TShape* shape) {
   CHECK(shape != nullptr);
-  uint32_t count = 0;
-  for (uint32_t i = 0; i < shape->ndim(); ++i) {
+  size_t count = 0;
+  for (size_t i = 0; i < shape->ndim(); ++i) {
     if ((*shape)[i] == 0) {
       ++count;
     } else {
@@ -2167,7 +2170,7 @@ inline bool SqueezeShape(const nnvm::NodeAttrs& attrs,
   if (param.axis.has_value()) {
     // preprocess axis
     TShape axes = param.axis.value();
-    for (uint32_t i = 0; i < axes.ndim(); ++i) {
+    for (size_t i = 0; i < axes.ndim(); ++i) {
       if (axes[i] < 0) {
         axes[i] += dndim;
         CHECK_GE(axes[i], 0)
@@ -2182,11 +2185,11 @@ inline bool SqueezeShape(const nnvm::NodeAttrs& attrs,
       oshape[axes[i]] = 0;
     }
   } else {
-    for (uint32_t i = 0; i < oshape.ndim(); ++i) {
+    for (size_t i = 0; i < oshape.ndim(); ++i) {
       if (oshape[i] == 1) oshape[i] = 0;
     }
   }
-  uint32_t oshape_size = SqueezeShapeHelper(&oshape);
+  size_t oshape_size = SqueezeShapeHelper(&oshape);
   if (oshape_size == 0) {  // corner case when dshape is (1, 1, 1, 1)
     oshape[0] = 1;
     oshape_size = 1;
@@ -2229,7 +2232,7 @@ inline bool DepthToSpaceOpShape(const nnvm::NodeAttrs& attrs,
 
   expected_out[0] = in_shape[0];
   expected_out[1] = in_shape[1] / (block * block);
-  uint32_t i = 2;
+  size_t i = 2;
   while (i < expected_out.ndim()) {
     expected_out[i] = in_shape[i] * block;
     ++i;
@@ -2259,9 +2262,9 @@ inline bool DepthToSpaceOpType(const nnvm::NodeAttrs& attrs,
  * \param inp_index         index within input tensor from where value is retrieved
  * \param offset_arr        array containing the linear offset of input tensor
  */
-MSHADOW_XINLINE void update_index(int index_position, int dim_size, int *idx,
-                                  int *inp_index, const int* offset_arr) {
-  int next_idx_val = *idx / dim_size;
+MSHADOW_XINLINE void update_index(index_t index_position, index_t dim_size, index_t *idx,
+                                  index_t *inp_index, const index_t* offset_arr) {
+  index_t next_idx_val = *idx / dim_size;
   *inp_index += (*idx - next_idx_val * dim_size) * offset_arr[index_position];
   *idx = next_idx_val;
 }
@@ -2280,9 +2283,9 @@ MSHADOW_XINLINE void update_index(int index_position, int dim_size, int *idx,
 template<int req>
 struct depth_to_space_forward {
   template<typename DType>
-  MSHADOW_XINLINE static void Map(int i, DType* out_data, const DType* in_data,
-                                  const int block, const int* size, const int* offset_arr) {
-    int inp_index = 0, idx = i, dim_size;
+  MSHADOW_XINLINE static void Map(index_t i, DType* out_data, const DType* in_data,
+                                  const int block, const index_t* size, const index_t* offset_arr) {
+    index_t inp_index = 0, idx = i, dim_size;
     dim_size = block;
     update_index(2, dim_size, &idx, &inp_index, offset_arr);
     dim_size = size[3];
@@ -2315,9 +2318,9 @@ struct depth_to_space_forward {
 template<int req>
 struct compute_offset_for_depth_to_space {
   template<typename DType>
-  MSHADOW_XINLINE static void Map(int i, DType* offset_arr, DType* size, const int block,
-                                  const int32_t size0, const int32_t size1, const int32_t size2,
-                                  const int32_t size3) {
+  MSHADOW_XINLINE static void Map(index_t i, DType* offset_arr, DType* size, const int block,
+                                  const index_t size0, const index_t size1, const index_t size2,
+                                  const index_t size3) {
     size[0] = size0;
     size[1] = size1;
     size[2] = size2;
@@ -2349,10 +2352,10 @@ void DepthToSpaceOpForward(const nnvm::NodeAttrs& attrs,
   int block = param.block_size;
 
   mshadow::Tensor<xpu, 1, char> workspace =
-    ctx.requested[0].get_space_typed<xpu, 1, char>(mshadow::Shape1(sizeof(int32_t) * 10), s);
+    ctx.requested[0].get_space_typed<xpu, 1, char>(mshadow::Shape1(sizeof(index_t) * 10), s);
   char* workspace_curr_ptr = workspace.dptr_;
-  int32_t* offset_arr = reinterpret_cast<int32_t*>(workspace_curr_ptr);
-  int32_t* size = reinterpret_cast<int32_t*>(workspace_curr_ptr + sizeof(int32_t) * 6);
+  index_t* offset_arr = reinterpret_cast<index_t*>(workspace_curr_ptr);
+  index_t* size = reinterpret_cast<index_t*>(workspace_curr_ptr + sizeof(index_t) * 6);
 
   MSHADOW_TYPE_SWITCH(out_data.type_flag_, DType, {
     MXNET_ASSIGN_REQ_SWITCH(req[0], req_type, {
@@ -2431,9 +2434,9 @@ inline bool SpaceToDepthOpType(const nnvm::NodeAttrs& attrs,
 template<int req>
 struct space_to_depth_forward {
   template<typename DType>
-  MSHADOW_XINLINE static void Map(int i, DType* out_data, const DType* in_data, const int block,
-                                  const int* size, const int* offset_arr) {
-    int inp_index = 0, idx = i, dim_size;
+  MSHADOW_XINLINE static void Map(index_t i, DType* out_data, const DType* in_data, const int block,
+                                  const index_t* size, const index_t* offset_arr) {
+    index_t inp_index = 0, idx = i, dim_size;
     dim_size = size[3] / block;
     update_index(4, dim_size, &idx, &inp_index, offset_arr);
     dim_size = size[2] / block;
@@ -2466,9 +2469,9 @@ struct space_to_depth_forward {
 template<int req>
 struct compute_offset_for_space_to_depth {
   template<typename DType>
-  MSHADOW_XINLINE static void Map(int i, DType* offset_arr, DType* size, const int block,
-                                  const int32_t size0, const int32_t size1,
-                                  const int32_t size2, const int32_t size3) {
+  MSHADOW_XINLINE static void Map(index_t i, DType* offset_arr, DType* size, const int block,
+                                  const index_t size0, const index_t size1,
+                                  const index_t size2, const index_t size3) {
     size[0] = size0;
     size[1] = size1;
     size[2] = size2;
@@ -2500,10 +2503,10 @@ void SpaceToDepthOpForward(const nnvm::NodeAttrs& attrs,
   int block = param.block_size;
 
   mshadow::Tensor<xpu, 1, char> workspace =
-    ctx.requested[0].get_space_typed<xpu, 1, char>(mshadow::Shape1(sizeof(int32_t) * 10), s);
+    ctx.requested[0].get_space_typed<xpu, 1, char>(mshadow::Shape1(sizeof(index_t) * 10), s);
   char* workspace_curr_ptr = workspace.dptr_;
-  int32_t* offset_arr = reinterpret_cast<int32_t*>(workspace_curr_ptr);
-  int32_t* size = reinterpret_cast<int32_t*>(workspace_curr_ptr + sizeof(int32_t) * 6);
+  index_t* offset_arr = reinterpret_cast<index_t*>(workspace_curr_ptr);
+  index_t* size = reinterpret_cast<index_t*>(workspace_curr_ptr + sizeof(index_t) * 6);
 
   MSHADOW_TYPE_SWITCH(out_data.type_flag_, DType, {
     MXNET_ASSIGN_REQ_SWITCH(req[0], req_type, {
diff --git a/tests/nightly/test_large_array.py b/tests/nightly/test_large_array.py
index 121acc174b51..a301362f2db7 100644
--- a/tests/nightly/test_large_array.py
+++ b/tests/nightly/test_large_array.py
@@ -15,20 +15,126 @@
 # specific language governing permissions and limitations
 # under the License.
 
-import unittest
 import mxnet as mx
+import numpy as np
 from mxnet import gluon, nd
 
+# dimension constants
+MEDIUM_X = 10000
+LARGE_X = 100000000
+LARGE_Y = 50000000
+SMALL_Y = 50
+LARGE_SIZE = LARGE_X * SMALL_Y
+
+def test_gluon_embedding():
+    m = gluon.nn.Embedding(SMALL_Y, MEDIUM_X)
+    m.initialize()
+    a = nd.zeros((MEDIUM_X, SMALL_Y))
+    b = m(a)
+    assert b.shape == (MEDIUM_X, SMALL_Y, MEDIUM_X)
+    assert b.asnumpy().size == LARGE_SIZE
+
+def test_ndarray_zeros():
+    a = nd.zeros(shape=(LARGE_X, SMALL_Y))
+    assert a[-1][0] == 0
+    assert a.shape == (LARGE_X, SMALL_Y)
+    assert a.size == LARGE_SIZE
+
+def test_ndarray_ones():
+    a = nd.ones(shape=(LARGE_X, SMALL_Y))
+    assert a[-1][0] == 1
+    assert nd.sum(a).asnumpy() == LARGE_SIZE
+
+def test_ndarray_random_uniform():
+    a = nd.random.uniform(shape=(LARGE_X, SMALL_Y))
+    assert a[-1][0] != 0
+
+def test_ndarray_empty():
+    a = nd.empty((LARGE_X, SMALL_Y))
+    assert a.shape == (LARGE_X, SMALL_Y)
+
+def test_elementwise():
+    a = nd.ones(shape=(LARGE_X, SMALL_Y))
+    b = nd.ones(shape=(LARGE_X, SMALL_Y))
+    res = a + b
+    assert np.sum(res[-1].asnumpy() == 2) == a.shape[1]
+    res = a + 1
+    assert np.sum(res[-1].asnumpy() == 2) == a.shape[1]
+    res = nd.sqrt(a + 3)
+    assert np.sum(res[-1].asnumpy() == 2) == a.shape[1]
+
+def test_reduce():
+    a = nd.ones(shape=(LARGE_X, SMALL_Y)) 
+    assert nd.sum(a).asnumpy() == a.shape[0] * a.shape[1]
+
+def test_dot():
+    a = nd.ones(shape=(LARGE_X, SMALL_Y)) 
+    b = nd.ones(shape=(SMALL_Y, SMALL_Y))
+    res = nd.dot(a, b)
+    assert np.sum(res[-1].asnumpy() == SMALL_Y) == b.shape[1]
+
+def test_FullyConnected():
+    a = nd.ones(shape=(LARGE_X, SMALL_Y)) 
+    b = nd.ones(shape=(SMALL_Y, SMALL_Y)) 
+    res = nd.FullyConnected(a, b, num_hidden=b.shape[1], no_bias=True)
+    assert np.sum(res[-1].asnumpy() == SMALL_Y) == b.shape[1]
+
+def test_broadcast():
+    a = nd.ones(shape=(LARGE_X, SMALL_Y))
+    b = nd.arange(0, LARGE_X).reshape(LARGE_X, 1)
+    res = nd.broadcast_to(b, shape=(b.shape[0], SMALL_Y))
+    assert np.sum(res[-1].asnumpy() == LARGE_X) == res.shape[1]
+    res = mx.nd.broadcast_like(b, a)
+    assert np.sum(res[-1].asnumpy() == LARGE_X) == a.shape[1]
+
+def test_clip():
+    a = nd.arange(0, LARGE_X).reshape(LARGE_X, 1)
+    b = nd.broadcast_to(a, shape=(a.shape[0], SMALL_Y))
+    res = nd.clip(b, a_min=100, a_max=1000)
+    assert np.sum(res[-1].asnumpy() == 1000) == b.shape[1]
+
+def test_take():
+    a = nd.ones(shape=(LARGE_X, SMALL_Y))
+    idx = nd.arange(LARGE_X-1000, LARGE_X)
+    res = nd.take(a, idx)
+    assert np.sum(res[-1].asnumpy() == 1) == res.shape[1]
+
+def test_slice():
+    a = nd.ones(shape=(LARGE_X, SMALL_Y))
+    res = nd.slice(a, begin=(LARGE_X-1000, 1), end=(LARGE_X, SMALL_Y))
+    assert np.sum(res[-1].asnumpy() == 1) == res.shape[1]
+
+def test_slice_assign():
+    a = nd.ones(shape=(LARGE_X, SMALL_Y))
+    a[LARGE_X-1:LARGE_X] = 1000
+    assert np.sum(a[-1].asnumpy() == 1000) == a.shape[1]
+ 
+def test_expand_dims():
+    a = nd.ones(shape=(LARGE_X, SMALL_Y))
+    res = nd.expand_dims(a, axis=1)
+    assert res.shape == (a.shape[0], 1, a.shape[1])
+
+def test_squeeze():
+    a = nd.ones(shape=(LARGE_X, SMALL_Y))
+    data = nd.expand_dims(a, axis=1)
+    res = nd.squeeze(data)
+    assert res.shape == a.shape
+
+def test_broadcast_div():
+    a = nd.ones(shape=(LARGE_X, SMALL_Y))
+    b = nd.ones(shape=(LARGE_X, 1)) * 2
+    res = a / b
+    assert np.sum(res[-1].asnumpy() == 0.5) == a.shape[1]
+
+def test_Dense(ctx=mx.cpu(0)):
+    data = mx.nd.ones(shape=(50*1000*1000, 100))
+    linear = gluon.nn.Dense(100)
+    linear.initialize(ctx=ctx)
+    res = linear(data)
+    res.wait_to_read()
+    assert res.shape == (50000000, 100)
 
-class TestLargeArray(unittest.TestCase):
-    def test_ndarray2numpy(self):
-        m = gluon.nn.Embedding(14000, 128)
-        m.initialize()
-        ind = nd.zeros((700000, 128))
-        x = m(ind)
-        x.shape
-        test = x.asnumpy()
-        assert (x.shape == test.shape)
 
 if __name__ == '__main__':
-    unittest.main()
+    import nose
+    nose.runmodule()

From baeada494046d91b580c7985af53e8941dbde104 Mon Sep 17 00:00:00 2001
From: Gaurav Gireesh <Gaurav.Gireesh@fox.com>
Date: Fri, 30 Nov 2018 22:53:46 -0800
Subject: [PATCH 08/54] [MXNET-1210 ] Gluon Audio - Example (#13325)

* Initialized the example

* Addressed PR comments, about existing synset.txt file - no overwrite

* RST - docstring issues fixed

* added README

* Addressed PR comments

* Addressed PR comments, checking Divide by 0

* Raising error if format is not supported.

* changed a line for ndarray of labels

* Trigger CI

* Trigger CI

* PR comments addressed around skip_header argument

* Addressed PR comments around librosa import

* PR Comments

* Passing lazy=lazy from argument

* Added PR comments, labels to README.MD

* Trigger CI

* Addressing PR Comments in README

* Modified README.md

* Added example under audio folder

* Retrigger CI

* Retrigger CI
---
 example/gluon/audio/transforms.py             | 205 ++++++++++++++++++
 example/gluon/audio/urban_sounds/README.md    | 100 +++++++++
 example/gluon/audio/urban_sounds/datasets.py  | 179 +++++++++++++++
 example/gluon/audio/urban_sounds/model.py     |  33 +++
 example/gluon/audio/urban_sounds/predict.py   |  92 ++++++++
 .../gluon/audio/urban_sounds/requirements.txt |   2 +
 example/gluon/audio/urban_sounds/train.py     | 157 ++++++++++++++
 7 files changed, 768 insertions(+)
 create mode 100644 example/gluon/audio/transforms.py
 create mode 100644 example/gluon/audio/urban_sounds/README.md
 create mode 100644 example/gluon/audio/urban_sounds/datasets.py
 create mode 100644 example/gluon/audio/urban_sounds/model.py
 create mode 100644 example/gluon/audio/urban_sounds/predict.py
 create mode 100644 example/gluon/audio/urban_sounds/requirements.txt
 create mode 100644 example/gluon/audio/urban_sounds/train.py

diff --git a/example/gluon/audio/transforms.py b/example/gluon/audio/transforms.py
new file mode 100644
index 000000000000..8b76d131cdb1
--- /dev/null
+++ b/example/gluon/audio/transforms.py
@@ -0,0 +1,205 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# coding: utf-8
+# pylint: disable= arguments-differ
+"""Audio transforms."""
+
+import warnings
+import numpy as np
+try:
+    import librosa
+except ImportError as e:
+    warnings.warn("librosa dependency could not be resolved or \
+    imported, could not provide some/all transform.")
+
+from mxnet import ndarray as nd
+from mxnet.gluon.block import Block
+
+class MFCC(Block):
+    """Extracts Mel frequency cepstrum coefficients from the audio data file
+    More details : https://librosa.github.io/librosa/generated/librosa.feature.mfcc.html
+
+    Attributes
+    ----------
+    sampling_rate: int, default 22050
+        sampling rate of the input audio signal
+    num_mfcc: int, default 20
+        number of mfccs to return
+
+
+    Inputs:
+        - **x**: input tensor (samples, ) shape.
+
+    Outputs:
+        - **out**: output NDArray of MFCCs averaged over the time frames, with (num_mfcc, ) shape.
+
+    """
+
+    def __init__(self, sampling_rate=22050, num_mfcc=20):
+        self._sampling_rate = sampling_rate
+        self._num_fcc = num_mfcc
+        super(MFCC, self).__init__()
+
+    def forward(self, x):
+        if isinstance(x, np.ndarray):
+            y = x
+        elif isinstance(x, nd.NDArray):
+            y = x.asnumpy()
+        else:
+            warnings.warn("MFCC - allowed datatypes mx.nd.NDArray and numpy.ndarray")
+            return x
+
+        audio_tmp = np.mean(librosa.feature.mfcc(y=y, sr=self._sampling_rate, n_mfcc=self._num_fcc).T, axis=0)
+        return nd.array(audio_tmp)
+
+
+class Scale(Block):
+    """Scale audio (numpy.ndarray or NDArray) by dividing it by `scale_factor`, for example
+    to map integer samples to floats in [-1.0, 1.0]. The default, 2**31, suits 32-bit samples.
+
+    Attributes
+    ----------
+    scale_factor : float
+        The factor to scale the input tensor by.
+
+
+    Inputs:
+        - **x**: input tensor (samples, ) shape.
+
+    Outputs:
+        - **out**: output array is a scaled NDArray with (samples, ) shape.
+
+    Examples
+    --------
+    >>> scale = audio.transforms.Scale(scale_factor=2)
+    >>> audio_samples = mx.nd.array([2,3,4])
+    >>> scale(audio_samples)
+    [1.  1.5 2. ]
+    <NDArray 3 @cpu(0)>
+
+    """
+
+    def __init__(self, scale_factor=2**31):
+        self.scale_factor = scale_factor
+        super(Scale, self).__init__()
+
+    def forward(self, x):
+        if self.scale_factor == 0:
+            warnings.warn("Scale factor cannot be 0.")
+            return x
+        if isinstance(x, np.ndarray):
+            return nd.array(x/self.scale_factor)
+        return x / self.scale_factor
+
+
+class PadTrim(Block):
+    """Pad/Trim a 1d NDArray or numpy.ndarray (Signal or Labels)
+
+    Attributes
+    ----------
+    max_len : int
+        Length to which the array will be padded or trimmed to.
+    fill_value: int or float
+        If there is a need of padding, what value to pad at the end of the input array.
+
+
+    Inputs:
+        - **x**: input tensor (samples, ) shape.
+
+    Outputs:
+        - **out**: output array is a padded or trimmed NDArray with (max_len, ) shape.
+
+    Examples
+    --------
+    >>> padtrim = audio.transforms.PadTrim(max_len=9, fill_value=0)
+    >>> audio_samples = mx.nd.array([1,2,3,4,5])
+    >>> padtrim(audio_samples)
+    [1. 2. 3. 4. 5. 0. 0. 0. 0.]
+    <NDArray 9 @cpu(0)>
+
+    """
+
+    def __init__(self, max_len, fill_value=0):
+        self._max_len = max_len
+        self._fill_value = fill_value
+        super(PadTrim, self).__init__()
+
+    def forward(self, x):
+        if  isinstance(x, np.ndarray):
+            x = nd.array(x)
+        if self._max_len > x.size:
+            pad = nd.ones((self._max_len - x.size,)) * self._fill_value
+            x = nd.concat(x, pad, dim=0)
+        elif self._max_len < x.size:
+            x = x[:self._max_len]
+        return x
+
+
+class MEL(Block):
+    """Create MEL Spectrograms from a raw audio signal. Relatively slow.
+
+    Attributes
+    ----------
+    sampling_rate: int, default 22050
+        sampling rate of the input audio signal
+    num_fft: int, default 2048
+        length of the Fast Fourier transform window
+    num_mels: int, default 20
+        number of mel bands to generate
+    hop_length: int, default 512
+        total samples between successive frames
+
+
+    Inputs:
+        - **x**: input tensor (samples, ) shape.
+
+    Outputs:
+        - **out**: output array which consists of mel spectrograms, shape = (num_mels, frames)
+
+       Usage (see librosa.feature.melspectrogram docs):
+           MEL(sampling_rate=16000, num_fft=1600, hop_length=800, num_mels=64)
+
+    Examples
+    --------
+    >>> mel = audio.transforms.MEL(num_mels=128)
+    >>> audio_samples = mx.nd.array([1,2,3,4,5])
+    >>> mel(audio_samples)
+    [[3.81801406e+04]
+    [9.86858240e-29]
+    [1.87405472e-29]
+    [2.38637225e-29]
+    [3.94043010e-29]
+    [3.67071565e-29]
+    [7.29390295e-29]
+    [8.84324438e-30]...
+    <NDArray 128x1 @cpu(0)>
+
+    """
+
+    def __init__(self, sampling_rate=22050, num_fft=2048, num_mels=20, hop_length=512):
+        self._sampling_rate = sampling_rate
+        self._num_fft = num_fft
+        self._num_mels = num_mels
+        self._hop_length = hop_length
+        super(MEL, self).__init__()
+
+    def forward(self, x):
+        if isinstance(x, nd.NDArray):
+            x = x.asnumpy()
+        specs = librosa.feature.melspectrogram(x, sr=self._sampling_rate,\
+        n_fft=self._num_fft, n_mels=self._num_mels, hop_length=self._hop_length)
+        return nd.array(specs)
diff --git a/example/gluon/audio/urban_sounds/README.md b/example/gluon/audio/urban_sounds/README.md
new file mode 100644
index 000000000000..c85d29db2e5a
--- /dev/null
+++ b/example/gluon/audio/urban_sounds/README.md
@@ -0,0 +1,100 @@
+# Urban Sounds Classification in MXNet Gluon
+
+This example provides an end-to-end pipeline for a common datahack competition - Urban Sounds Classification Example.
+Below is the link to the competition:
+https://datahack.analyticsvidhya.com/contest/practice-problem-urban-sound-classification/
+
+After logging in, the data set can be downloaded.
+The details of the dataset and the link to download it are given below:
+
+
+## Urban Sounds Dataset:
+### Description
+  The dataset contains 8732 wav files which are audio samples (<= 4s) of street sounds like engine_idling, car_horn, children_playing, dog_barking and so on.
+  The task is to classify these audio samples into one of the following 10 labels:
+  ```
+  siren,
+  street_music,
+  drilling,
+  dog_bark,
+  children_playing,
+  gun_shot,
+  engine_idling,
+  air_conditioner,
+  jackhammer,
+  car_horn
+  ```
+
+To be able to run this example:
+
+1. `pip install -r requirements.txt`
+
+    If you are in the directory where the requirements.txt file lies,
+    this step installs the required libraries to run the example.
+    The main dependency that is required is: Librosa. 
+    The version used to test the example is: `0.6.2`
+    For more details, refer here:
+https://librosa.github.io/librosa/install.html
+
+2. Download the dataset(train.zip, test.zip) required for this example from the location:
+https://drive.google.com/drive/folders/0By0bAi7hOBAFUHVXd1JCN3MwTEU
+
+3. Extract both the zip archives into the **current directory** - after unzipping you would get 2 new folders namely,
+   **Train** and **Test** and two csv files - **train.csv**, **test.csv**
+
+   Assuming you are in a directory *"UrbanSounds"*, after downloading and extracting train.zip, the folder structure should be:
+   
+   ```
+        UrbanSounds        
+                    - Train
+                        - 0.wav, 1.wav ...
+                    - train.csv
+                    - train.py
+                    - predict.py ...
+    ```
+
+4. Make sure Apache MXNet is installed on the machine. For instructions, go to the link: https://mxnet.incubator.apache.org/install/
+
+
+
+For information on the current design of how the AudioFolderDataset is implemented, refer below:
+https://cwiki.apache.org/confluence/display/MXNET/Gluon+-+Audio
+
+### Usage 
+
+For training:
+
+- Arguments
+  - train : The folder/directory that contains the audio(wav) files locally. Default = "./Train"
+  - csv: The file name of the csv file that contains audio file name to label mapping. Default = "train.csv"
+  - epochs : Number of epochs to train the model. Default = 30
+  - batch_size : The batch size for training. Default = 32
+
+
+###### To use the default arguments, use:
+```
+python train.py
+``` 
+or
+
+###### To pass command-line arguments for training data directory, epochs, batch_size, csv file name, use :
+```
+python train.py --train ./Train --csv train.csv --batch_size 32 --epochs 30 
+```
+
+For prediction:
+
+- Arguments
+  - pred : The folder/directory that contains the audio(wav) files which are to be classified. Default = "./Test"
+
+
+###### To use the default arguments, use:
+```
+python predict.py
+``` 
+or
+
+###### To pass command-line arguments for test data directory, use :
+```
+python predict.py --pred ./Test
+```
\ No newline at end of file
diff --git a/example/gluon/audio/urban_sounds/datasets.py b/example/gluon/audio/urban_sounds/datasets.py
new file mode 100644
index 000000000000..51c040c8f162
--- /dev/null
+++ b/example/gluon/audio/urban_sounds/datasets.py
@@ -0,0 +1,179 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# coding: utf-8
+# pylint: disable=
+""" Audio Dataset container."""
+from __future__ import print_function
+__all__ = ['AudioFolderDataset']
+
+import os
+import warnings
+from itertools import islice
+import csv
+from mxnet.gluon.data import Dataset
+from mxnet import ndarray as nd
+try:
+    import librosa
+except ImportError as e:
+    raise ImportError("librosa dependency could not be resolved or \
+    imported, could not load audio onto the numpy array. pip install librosa")
+
+
+
+class AudioFolderDataset(Dataset):
+    """A dataset for loading Audio files stored in a folder structure like::
+
+        root/children_playing/0.wav
+        root/siren/23.wav
+        root/drilling/26.wav
+        root/dog_barking/42.wav
+            OR
+        Files(wav) and a csv file that has file name and associated label
+
+    Parameters
+    ----------
+    root : str
+        Path to root directory.
+    transform : callable, default None
+        A function that takes data and label and transforms them
+    train_csv: str, default None
+       train_csv should be populated by the training csv filename
+    file_format: str, default '.wav'
+        The format of the audio files(.wav)
+    skip_header: boolean, default False
+        While reading from the csv file, whether to skip the first row so the header is not read in
+
+
+    Attributes
+    ----------
+    synsets : list
+        List of class names. `synsets[i]` is the name for the  `i`th label
+    items : list of tuples
+        List of all audio in (filename, label) pairs.
+
+    """
+    def __init__(self, root, train_csv=None, file_format='.wav', skip_header=False):
+        if not librosa:
+            warnings.warn("pip install librosa to continue.")
+            raise RuntimeError("Librosa not installed. Run pip install librosa and retry this step.")
+        self._root = os.path.expanduser(root)
+        self._exts = ['.wav']
+        self._format = file_format
+        self._train_csv = train_csv
+        if file_format.lower() not in self._exts:
+            raise RuntimeError("Format {} not supported currently.".format(file_format))
+        skip_rows = 0
+        if skip_header:
+            skip_rows = 1
+        self._list_audio_files(self._root, skip_rows=skip_rows)
+
+
+    def _list_audio_files(self, root, skip_rows=0):
+        """Populates synsets - a map of index to label for the data items.
+        Populates the data in the dataset, making tuples of (data, label)
+        """
+        self.synsets = []
+        self.items = []
+        if not self._train_csv:
+            # The audio files are organized in folder structure with
+            # directory name as label and audios in them
+            self._folder_structure(root)
+        else:
+            # train_csv contains mapping between filename and label
+            self._csv_labelled_dataset(root, skip_rows=skip_rows)
+
+        # Generating the synset.txt file now
+        if not os.path.exists("./synset.txt"):
+            with open("./synset.txt", "w") as synsets_file:
+                for item in self.synsets:
+                    synsets_file.write(item+os.linesep)
+            print("Synsets is generated as synset.txt")
+        else:
+            warnings.warn("Synset file already exists in the current directory! Not generating synset.txt.")
+
+
+    def _folder_structure(self, root):
+        for folder in sorted(os.listdir(root)):
+            path = os.path.join(root, folder)
+            if not os.path.isdir(path):
+                warnings.warn('Ignoring {}, which is not a directory.'.format(path))
+                continue
+            label = len(self.synsets)
+            self.synsets.append(folder)
+            for filename in sorted(os.listdir(path)):
+                file_name = os.path.join(path, filename)
+                ext = os.path.splitext(file_name)[1]
+                if ext.lower() not in self._exts:
+                    warnings.warn('Ignoring {} of type {}. Only support {}'\
+                    .format(filename, ext, ', '.join(self._exts)))
+                    continue
+                self.items.append((file_name, label))
+
+
+    def _csv_labelled_dataset(self, root, skip_rows=0):
+        with open(self._train_csv, "r") as traincsv:
+            for line in islice(csv.reader(traincsv), skip_rows, None):
+                filename = os.path.join(root, line[0])
+                label = line[1].strip()
+                if label not in self.synsets:
+                    self.synsets.append(label)
+                if self._format not in filename:
+                    filename = filename+self._format
+                self.items.append((filename, nd.array([self.synsets.index(label)]).reshape((1,))))
+
+
+    def __getitem__(self, idx):
+        """Retrieve the item (data, label) stored at idx in items"""
+        filename, label = self.items[idx]
+        # resampling_type is passed as kaiser_fast for a better performance
+        X1, _ = librosa.load(filename, res_type='kaiser_fast')
+        return nd.array(X1), label
+
+
+    def __len__(self):
+        """Retrieves the number of items in the dataset"""
+        return len(self.items)
+
+
+    def transform_first(self, fn, lazy=False):
+        """Returns a new dataset with the first element of each sample
+        transformed by the transformer function `fn`.
+
+        This is useful, for example, when you only want to transform data
+        while keeping label as is.
+        lazy=False is passed to transform_first for dataset so that all transforms could be performed in
+        one shot and not during training. This is a performance consideration.
+
+        Parameters
+        ----------
+        fn : callable
+            A transformer function that takes the first element of a sample
+            as input and returns the transformed element.
+        lazy : bool, default False
+            If False, transforms all samples at once. Otherwise,
+            transforms each sample on demand. Note that if `fn`
+            is stochastic, you must set lazy to True or you will
+            get the same result on all epochs.
+
+        Returns
+        -------
+        Dataset
+            The transformed dataset.
+
+        """
+        return super(AudioFolderDataset, self).transform_first(fn, lazy=lazy)
diff --git a/example/gluon/audio/urban_sounds/model.py b/example/gluon/audio/urban_sounds/model.py
new file mode 100644
index 000000000000..af23cb946e2e
--- /dev/null
+++ b/example/gluon/audio/urban_sounds/model.py
@@ -0,0 +1,33 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""This module builds an MLP model with a configurable output layer (number of units in the last layer).
+Users can pass any number of units in the last layer. Since this dataset has 10 labels,
+the default value of num_labels is 10.
+"""
+import mxnet as mx
+from mxnet import gluon
+
+def get_net(num_labels=10):
+    """Build an MLP: two 256-unit ReLU Dense layers, then a Dense layer of `num_labels` units; Xavier-initialized."""
+    net = gluon.nn.Sequential()
+    with net.name_scope():
+        for hidden_units in (256, 256):  # 1st and 2nd hidden layers (256 nodes each)
+            net.add(gluon.nn.Dense(hidden_units, activation="relu"))
+        net.add(gluon.nn.Dense(num_labels))
+    net.collect_params().initialize(mx.init.Xavier())
+    return net
diff --git a/example/gluon/audio/urban_sounds/predict.py b/example/gluon/audio/urban_sounds/predict.py
new file mode 100644
index 000000000000..0c3631173667
--- /dev/null
+++ b/example/gluon/audio/urban_sounds/predict.py
@@ -0,0 +1,92 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+""" Prediction module for Urban Sounds Classification"""
+from __future__ import print_function
+import os
+import sys
+import warnings
+import mxnet as mx
+from mxnet import nd
+from model import get_net
+try:
+    import librosa
+except ImportError:
+    raise ImportError("Librosa is not installed! please run the following command:\
+     `pip install librosa`")
+sys.path.append('../')
+
+def predict(prediction_dir='./Test'):
+    """Run predictions on the audio files in the directory `prediction_dir`.
+
+    Labels are read from ./synset.txt and model weights from ./net.params; a warning is
+    issued and nothing is predicted if the directory or either of those files is unusable.
+
+    Parameters
+    ----------
+    prediction_dir: string, default ./Test
+        The directory that contains the audio files on which predictions are to be made
+    """
+
+    if not prediction_dir or not os.path.exists(prediction_dir):
+        warnings.warn("The directory on which predictions are to be made is not found!")
+        return
+
+    file_names = os.listdir(prediction_dir)  # listed once and reused for the prediction loop below
+    if not file_names:
+        warnings.warn("The directory on which predictions are to be made is empty! Exiting...")
+        return
+
+    # Loading synsets
+    if not os.path.exists('./synset.txt'):
+        warnings.warn("The synset or labels for the dataset do not exist. Please run the training script first.")
+        return
+
+    with open("./synset.txt", "r") as f:
+        synset = [l.rstrip() for l in f]
+    net = get_net(len(synset))
+    print("Trying to load the model with the saved parameters...")
+    if not os.path.exists("./net.params"):
+        warnings.warn("The model does not have any saved parameters... Cannot proceed! Train the model first")
+        return
+
+    net.load_parameters("./net.params")
+    full_file_names = [os.path.join(prediction_dir, item) for item in file_names]
+    from transforms import MFCC
+    mfcc = MFCC()
+    print("\nStarting predictions for audio files in ", prediction_dir, " ....\n")
+    for filename in full_file_names:
+        # Argument kaiser_fast to res_type is faster than 'kaiser_best'. To reduce the load time, passing kaiser_fast.
+        X1, _ = librosa.load(filename, res_type='kaiser_fast')
+        transformed_test_data = mfcc(mx.nd.array(X1))
+        output = net(transformed_test_data.reshape((1, -1)))
+        prediction = nd.argmax(output, axis=1)
+        print(filename, " -> ", synset[int(prediction.asscalar())])
+
+
+if __name__ == '__main__':
+    try:
+        import argparse
+        parser = argparse.ArgumentParser(description="Urban Sounds clsssification example - MXNet")
+        # Default to ./Test so that running the script without --pred does not pass None to predict()
+        parser.add_argument('--pred', '-p', help="Enter the folder path that contains your audio files", type=str, default='./Test')
+        args = parser.parse_args()
+        pred_dir = args.pred
+    except ImportError:
+        warnings.warn("Argparse module not installed! passing default arguments.")
+        pred_dir = './Test'
+    predict(prediction_dir=pred_dir)
+    print("Urban sounds classification Prediction DONE!")
diff --git a/example/gluon/audio/urban_sounds/requirements.txt b/example/gluon/audio/urban_sounds/requirements.txt
new file mode 100644
index 000000000000..d885e0beec7e
--- /dev/null
+++ b/example/gluon/audio/urban_sounds/requirements.txt
@@ -0,0 +1,2 @@
+librosa>=0.6.2 # librosa is a library that is used to load the audio(wav) files and provides capabilities of feature extraction.
+argparse # used for parsing arguments
\ No newline at end of file
diff --git a/example/gluon/audio/urban_sounds/train.py b/example/gluon/audio/urban_sounds/train.py
new file mode 100644
index 000000000000..c88f9fb55187
--- /dev/null
+++ b/example/gluon/audio/urban_sounds/train.py
@@ -0,0 +1,157 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""The module to run training on the Urban sounds dataset"""
+from __future__ import print_function
+import sys
+import os
+import time
+import warnings
+import mxnet as mx
+from mxnet import gluon, nd, autograd
+from datasets import AudioFolderDataset
+import model
+sys.path.append('../')
+
+def evaluate_accuracy(data_iterator, net):
+    """Return the accuracy of `net` accumulated over every (data, label) batch of `data_iterator`."""
+    metric = mx.metric.Accuracy()
+    for batch, target in data_iterator:
+        scores = net(batch)
+        # argmax over axis 1 selects the predicted class; reshape gives one prediction per row
+        predicted = nd.argmax(scores, axis=1).reshape((-1, 1))
+        metric.update(preds=predicted, labels=target)
+    return metric.get()[1]
+
+
+def train(train_dir=None, train_csv=None, epochs=30, batch_size=32):
+    """Train the MLP on the audio files in `train_dir` (labelled by `train_csv`) and save the weights to ./net.params."""
+
+    if not train_dir or not os.path.exists(train_dir) or not train_csv:
+        warnings.warn("No train directory could be found ")
+        return
+    # Make a dataset from the local folder containing Audio data
+    print("\nMaking an Audio Dataset...\n")
+    tick = time.time()
+    aud_dataset = AudioFolderDataset(train_dir, train_csv=train_csv, file_format='.wav', skip_header=True)
+    tock = time.time()
+
+    print("Loading the dataset took ", (tock-tick), " seconds.")
+    print("\n=======================================\n")
+    print("Number of output classes = ", len(aud_dataset.synsets))
+    print("\nThe labels are : \n")
+    print(aud_dataset.synsets)
+    # Get the model to train
+    net = model.get_net(len(aud_dataset.synsets))
+    print("\nNeural Network = \n")
+    print(net)
+    print("\nModel - Neural Network Generated!\n")
+    print("=======================================\n")
+
+    #Define the loss - Softmax CE Loss
+    softmax_loss = gluon.loss.SoftmaxCELoss(from_logits=False, sparse_label=True)
+    print("Loss function initialized!\n")
+    print("=======================================\n")
+
+    #Define the trainer with the optimizer
+    trainer = gluon.Trainer(net.collect_params(), 'adadelta')
+    print("Optimizer - Trainer function initialized!\n")
+    print("=======================================\n")
+    print("Loading the dataset to the Gluon's OOTB Dataloader...")
+
+    #Getting the data loader out of the AudioDataset and passing the transform
+    from transforms import MFCC
+    aud_transform = MFCC()
+    tick = time.time()
+    # Honour the caller's batch_size instead of a fixed value
+    audio_train_loader = gluon.data.DataLoader(aud_dataset.transform_first(aud_transform), batch_size=batch_size, shuffle=True)
+    tock = time.time()
+    print("Time taken to load data and apply transform here is ", (tock-tick), " seconds.")
+    print("=======================================\n")
+
+
+    print("Starting the training....\n")
+    # Training loop
+    tick = time.time()
+    # Total number of samples, used below to report the average loss per sample
+    num_examples = len(aud_dataset)
+
+    for epoch in range(epochs):
+        cumulative_loss = 0
+        for data, label in audio_train_loader:
+            with autograd.record():
+                output = net(data)
+                loss = softmax_loss(output, label)
+            loss.backward()
+            # Normalise by the actual batch size: the last batch of an epoch may be smaller than batch_size
+            trainer.step(data.shape[0])
+            cumulative_loss += mx.nd.sum(loss).asscalar()
+
+        if epoch%5 == 0:
+            train_accuracy = evaluate_accuracy(audio_train_loader, net)
+            print("Epoch {}. Loss: {} Train accuracy : {} ".format(epoch, cumulative_loss/num_examples, train_accuracy))
+            print("\n------------------------------\n")
+
+    train_accuracy = evaluate_accuracy(audio_train_loader, net)
+    tock = time.time()
+    print("\nFinal training accuracy: ", train_accuracy)
+
+    print("Training the sound classification for ", epochs, " epochs, MLP model took ", (tock-tick), " seconds")
+    print("====================== END ======================\n")
+
+    print("Trying to save the model parameters here...")
+    net.save_parameters("./net.params")
+    print("Saved the model parameters in current directory.")
+
+
+if __name__ == '__main__':
+    # Fallback values, kept for every option that is absent (or falsy) on the command line
+    training_dir = './Train'
+    training_csv = './train.csv'
+    epochs = 30
+    batch_size = 32
+
+    # argparse is imported inside the try so that a failed import falls back to the values above
+    try:
+        import argparse
+        parser = argparse.ArgumentParser(description="Urban Sounds classification example - MXNet Gluon")
+        parser.add_argument('--train', '-t', help="Enter the folder path that contains your audio files", type=str)
+        parser.add_argument('--csv', '-c', help="Enter the filename of the csv that contains filename\
+        to label mapping", type=str)
+        parser.add_argument('--epochs', '-e', help="Enter the number of epochs \
+        you would want to run the training for.", type=int)
+        parser.add_argument('--batch_size', '-b', help="Enter the batch_size of data", type=int)
+        args = parser.parse_args()
+
+        # A parsed value replaces its fallback only when it is truthy, i.e. it was
+        # actually supplied and is neither an empty string nor zero.
+        training_dir = args.train or training_dir
+        training_csv = args.csv or training_csv
+        epochs = args.epochs or epochs
+        batch_size = args.batch_size or batch_size
+
+    except ImportError:
+        warnings.warn("Argument parsing module could not be imported \
+        Passing default arguments.")
+
+
+    # Start training with the resolved settings
+    train(
+        train_dir=training_dir,
+        train_csv=training_csv,
+        epochs=epochs,
+        batch_size=batch_size)
+    print("Urban sounds classification Training DONE!")

From 77510d7b37a5da80bd43b3f1c21a39a52163dae8 Mon Sep 17 00:00:00 2001
From: Vandana Kannan <vandanavk@users.noreply.github.com>
Date: Fri, 30 Nov 2018 23:05:51 -0800
Subject: [PATCH 09/54] ONNX export: Instance normalization, Shape (#12920)

* ONNX import/export: Make backend_rep common

* ONNX export: Instance Normalization

* ONNX export: Shape operator
---
 .../contrib/onnx/mx2onnx/_op_translations.py  | 26 +++++
 .../onnx/{export => }/backend_rep.py          | 32 +++---
 tests/python-pytest/onnx/export/backend.py    |  4 +
 .../onnx/export/onnx_backend_test.py          |  4 +-
 .../onnx/import/mxnet_backend.py              |  6 +-
 .../onnx/import/mxnet_backend_rep.py          | 98 -------------------
 6 files changed, 54 insertions(+), 116 deletions(-)
 rename tests/python-pytest/onnx/{export => }/backend_rep.py (78%)
 delete mode 100644 tests/python-pytest/onnx/import/mxnet_backend_rep.py

diff --git a/python/mxnet/contrib/onnx/mx2onnx/_op_translations.py b/python/mxnet/contrib/onnx/mx2onnx/_op_translations.py
index e2aab6b1efa7..facdcfedcbca 100644
--- a/python/mxnet/contrib/onnx/mx2onnx/_op_translations.py
+++ b/python/mxnet/contrib/onnx/mx2onnx/_op_translations.py
@@ -623,6 +623,23 @@ def convert_identity(node, **kwargs):
     """
     return create_basic_op_node('Identity', node, kwargs)
 
+@mx_op.register("InstanceNorm")
+def convert_instancenorm(node, **kwargs):
+    """Translate an MXNet InstanceNorm node into an ONNX InstanceNormalization node.
+
+    The MXNet `eps` attribute (0.001 when absent) is forwarded as ONNX `epsilon`.
+    """
+    name, input_nodes, attrs = get_inputs(node, kwargs)
+
+    # epsilon is always emitted explicitly, whether or not `eps` was set on the MXNet node
+    instance_norm_node = onnx.helper.make_node(
+        'InstanceNormalization',
+        inputs=input_nodes,
+        outputs=[name],
+        name=name,
+        epsilon=float(attrs.get("eps", 0.001)))
+
+    return [instance_norm_node]
 
 @mx_op.register("LeakyReLU")
 def convert_leakyrelu(node, **kwargs):
@@ -1546,6 +1563,15 @@ def convert_sum(node, **kwargs):
         )
     return [node]
 
+
+@mx_op.register("shape_array")
+def convert_shape(node, **kwargs):
+    """Export MXNet's shape_array operator as an ONNX Shape operator by delegating
+    to create_basic_op_node, and return whatever that helper returns.
+    """
+    return create_basic_op_node('Shape', node, kwargs)
+
+
 @mx_op.register("hard_sigmoid")
 def convert_hardsigmoid(node, **kwargs):
     """Map MXNet's hard_sigmoid operator attributes to onnx's HardSigmoid operator
diff --git a/tests/python-pytest/onnx/export/backend_rep.py b/tests/python-pytest/onnx/backend_rep.py
similarity index 78%
rename from tests/python-pytest/onnx/export/backend_rep.py
rename to tests/python-pytest/onnx/backend_rep.py
index 8729eafea1a1..63836ac848df 100644
--- a/tests/python-pytest/onnx/export/backend_rep.py
+++ b/tests/python-pytest/onnx/backend_rep.py
@@ -16,16 +16,17 @@
 # under the License.
 
 # coding: utf-8
-"""backend rep for onnx test infrastructure"""
+"""MXNet backend rep for onnx test infrastructure"""
 try:
     from onnx.backend.base import BackendRep
 except ImportError:
-    raise ImportError("Onnx and protobuf need to be installed")
+    raise ImportError("Onnx and protobuf need to be installed. Instructions to"
+                      + " install - https://github.com/onnx/onnx#installation")
 import mxnet as mx
 
 # Using these functions for onnx test infrastructure.
 # Implemented by following onnx docs guide:
-# https://github.com/onnx/onnx/blob/master/docs/Implementing%20an%20ONNX%20backend.md
+# https://github.com/onnx/onnx/blob/master/docs/ImplementingAnOnnxBackend.md
 # MXNetBackendRep object will be returned by MXNetBackend's prepare method which is used to
 # execute a model repeatedly.
 # Inputs will be passed to the run method of MXNetBackendRep class, it will perform computation and
@@ -54,9 +55,6 @@ def run(self, inputs, **kwargs):
         params : numpy array
             result obtained after running the inference on mxnet
         """
-        data_forward = []
-        for val in inputs:
-            data_forward.append(mx.nd.array(val))
         # create module, passing cpu context
         if self.device == 'CPU':
             ctx = mx.cpu()
@@ -68,17 +66,19 @@ def run(self, inputs, **kwargs):
         data_names = [graph_input for graph_input in self.symbol.list_inputs()
                       if graph_input not in self.arg_params and graph_input not in self.aux_params]
 
-        data_shapes = []
+        data_forward = []
         for idx, input_name in enumerate(data_names):
-            data_shapes.append((input_name, inputs[idx].shape))
+            val = inputs[idx]
+            data_forward.append(mx.nd.array(val))
 
-        mod = mx.mod.Module(symbol=self.symbol, data_names=data_names, context=ctx,
-                            label_names=None)
-        mod.bind(for_training=False, data_shapes=data_shapes,
-                 label_shapes=None)
-        mod.set_params(arg_params=self.arg_params, aux_params=self.aux_params)
+        if self.arg_params:
+            for idx, input_name in enumerate(self.arg_params):
+                val = self.arg_params[input_name]
+                data_names.append(input_name)
+                data_forward.append(mx.nd.array(val))
 
-        # run inference
-        mod.forward(mx.io.DataBatch(data_forward))
-        result = mod.get_outputs()[0].asnumpy()
+        args = dict(zip(data_names, data_forward))
+        exe = self.symbol.bind(ctx, args=args, aux_states=self.aux_params)
+        exe.forward(is_train=False)
+        result = exe.outputs[0].asnumpy()
         return [result]
diff --git a/tests/python-pytest/onnx/export/backend.py b/tests/python-pytest/onnx/export/backend.py
index e23cc01494e9..3ea1dafca255 100644
--- a/tests/python-pytest/onnx/export/backend.py
+++ b/tests/python-pytest/onnx/export/backend.py
@@ -17,6 +17,8 @@
 
 # coding: utf-8
 """backend wrapper for onnx test infrastructure"""
+import os
+import sys
 import numpy as np
 from mxnet.contrib.onnx.onnx2mx.import_onnx import GraphProto
 from mxnet.contrib.onnx.mx2onnx.export_onnx import MXNetGraph
@@ -25,6 +27,8 @@
     from onnx.backend.base import Backend
 except ImportError:
     raise ImportError("Onnx and protobuf need to be installed")
+CURR_PATH = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
+sys.path.insert(0, os.path.join(CURR_PATH, '../'))
 from backend_rep import MXNetBackendRep
 
 # Using these functions for onnx test infrastructure.
diff --git a/tests/python-pytest/onnx/export/onnx_backend_test.py b/tests/python-pytest/onnx/export/onnx_backend_test.py
index ec9ddf23c252..be9273eb6fac 100644
--- a/tests/python-pytest/onnx/export/onnx_backend_test.py
+++ b/tests/python-pytest/onnx/export/onnx_backend_test.py
@@ -95,7 +95,9 @@
     'test_clip'
     'test_cast',
     'test_depthtospace',
-    'test_hardsigmoid'
+    'test_hardsigmoid',
+    'test_instancenorm',
+    'test_shape'
     ]
 
 BASIC_MODEL_TESTS = [
diff --git a/tests/python-pytest/onnx/import/mxnet_backend.py b/tests/python-pytest/onnx/import/mxnet_backend.py
index 10f89ecbbbc7..bd4910b64f85 100644
--- a/tests/python-pytest/onnx/import/mxnet_backend.py
+++ b/tests/python-pytest/onnx/import/mxnet_backend.py
@@ -17,6 +17,8 @@
 
 # coding: utf-8
 """MXNet backend wrapper for onnx test infrastructure"""
+import os
+import sys
 from mxnet.contrib.onnx.onnx2mx.import_onnx import GraphProto
 try:
     from onnx import helper, TensorProto
@@ -24,7 +26,9 @@
 except ImportError:
     raise ImportError("Onnx and protobuf need to be installed. Instructions to"
                       + " install - https://github.com/onnx/onnx#installation")
-from mxnet_backend_rep import MXNetBackendRep
+CURR_PATH = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
+sys.path.insert(0, os.path.join(CURR_PATH, '../'))
+from backend_rep import MXNetBackendRep
 
 # MXNetBackend class will take an ONNX model with inputs, perform a computation,
 # and then return the output.
diff --git a/tests/python-pytest/onnx/import/mxnet_backend_rep.py b/tests/python-pytest/onnx/import/mxnet_backend_rep.py
deleted file mode 100644
index 938f25d38bf3..000000000000
--- a/tests/python-pytest/onnx/import/mxnet_backend_rep.py
+++ /dev/null
@@ -1,98 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# coding: utf-8
-"""MXNet backend rep for onnx test infrastructure"""
-try:
-    from onnx.backend.base import BackendRep
-except ImportError:
-    raise ImportError("Onnx and protobuf need to be installed. Instructions to"
-                      + " install - https://github.com/onnx/onnx#installation")
-import mxnet as mx
-
-# Using these functions for onnx test infrastructure.
-# Implemented by following onnx docs guide:
-# https://github.com/onnx/onnx/blob/master/docs/ImplementingAnOnnxBackend.md
-# MXNetBackendRep object will be returned by MXNetBackend's prepare method which is used to
-# execute a model repeatedly.
-# Inputs will be passed to the run method of MXNetBackendRep class, it will perform computation and
-# retrieve the corresponding results for comparison to the onnx backend.
-# https://github.com/onnx/onnx/blob/master/onnx/backend/test/runner/__init__.py.
-
-class MXNetBackendRep(BackendRep):
-    """Running model inference on mxnet engine and return the result
-     to onnx test infrastructure for comparison."""
-    def __init__(self, symbol, arg_params, aux_params, device):
-        self.symbol = symbol
-        self.arg_params = arg_params
-        self.aux_params = aux_params
-        self.device = device
-
-    def run(self, inputs, **kwargs):
-        """Run model inference and return the result
-
-        Parameters
-        ----------
-        inputs : numpy array
-            input to run a layer on
-
-        Returns
-        -------
-        params : numpy array
-            result obtained after running the inference on mxnet
-        """
-        data_forward = []
-        for val in inputs:
-            data_forward.append(mx.nd.array(val))
-        # create module, passing cpu context
-        if self.device == 'CPU':
-            ctx = mx.cpu()
-        else:
-            raise NotImplementedError("ONNX tests are run only for CPU context.")
-
-        # To fetch the data names of the input to the model we list the inputs of the symbol graph
-        # and exclude the argument and auxiliary parameters from the list
-        data_names = [graph_input for graph_input in self.symbol.list_inputs()
-                      if graph_input not in self.arg_params and graph_input not in self.aux_params]
-
-        data_shapes = []
-        for idx, input_name in enumerate(data_names):
-            data_shapes.append((input_name, inputs[idx].shape))
-
-        # module bind method requires all data to have same batch size,
-        # using module if all data have same batch size
-        if len(set([data_shape[1][0] for data_shape in data_shapes])) == 1:
-            mod = mx.mod.Module(symbol=self.symbol, data_names=data_names, context=ctx,
-                                label_names=None)
-            mod.bind(for_training=False, data_shapes=data_shapes,
-                     label_shapes=None)
-            mod.set_params(arg_params=self.arg_params, aux_params=self.aux_params)
-
-            # run inference
-            mod.forward(mx.io.DataBatch(data_forward))
-            result = mod.get_outputs()[0].asnumpy()
-            # split operator inference returns 1 less dimension
-            if self.symbol.name.startswith('split'):
-                return [i.asnumpy() for i in mod.get_outputs()]
-            return [result]
-        # using symbol bind method if data have different batch size
-        else:
-            exec1 = self.symbol.bind(ctx, args=dict(zip(data_names, data_forward)))
-            exec1.forward(is_train=False)
-            result = exec1.outputs[0].asnumpy()
-            return [result]
-

From 2bc4430c9c9999640a34df4a606a89b74b12008f Mon Sep 17 00:00:00 2001
From: Vishaal Kapoor <40836875+vishaalkapoor@users.noreply.github.com>
Date: Fri, 30 Nov 2018 23:35:42 -0800
Subject: [PATCH 10/54] Clarify dependency on OpenCV in CNN Visualization
 tutorial. (#13495)

---
 docs/tutorials/vision/cnn_visualization.md | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/docs/tutorials/vision/cnn_visualization.md b/docs/tutorials/vision/cnn_visualization.md
index 63d2b13271ba..5ded6f1587e0 100644
--- a/docs/tutorials/vision/cnn_visualization.md
+++ b/docs/tutorials/vision/cnn_visualization.md
@@ -1,16 +1,21 @@
 # Visualizing Decisions of Convolutional Neural Networks
 
-Convolutional Neural Networks have made a lot of progress in Computer Vision. Their accuracy is as good as humans in some tasks. However it remains hard to explain the predictions of convolutional neural networks, as they lack the interpretability offered by other models, for example decision trees.
+Convolutional Neural Networks have made a lot of progress in Computer Vision. Their accuracy is as good as humans in some tasks. However, it remains difficult to explain the predictions of convolutional neural networks, as they lack the interpretability offered by other models such as decision trees.
 
-It is often helpful to be able to explain why a model made the prediction it made. For example when a model misclassifies an image, it is hard to say why without visualizing the network's decision.
+It is often helpful to be able to explain why a model made the prediction it made. For example, when a model misclassifies an image, without visualizing the network's decision, it is hard to say why the misclassification was made.
 
 <img align="right" src="https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/example/cnn_visualization/volcano_barn_spider.png" alt="Explaining the misclassification of volcano as spider" width=500px/>
 
-Visualizations also help build confidence about the predictions of a model. For example, even if a model correctly predicts birds as birds, we would want to confirm that the model bases its decision on the features of bird and not on the features of some other object that might occur together with birds in the dataset (like leaves).
+Visualizations can also build confidence about the predictions of a model. For example, even if a model correctly predicts birds as birds, we would want to confirm that the model bases its decision on the features of bird and not on the features of some other object that might occur together with birds in the dataset (like leaves).
 
-In this tutorial, we show how to visualize the predictions made by convolutional neural networks using [Gradient-weighted Class Activation Mapping](https://arxiv.org/abs/1610.02391). Unlike many other visualization methods, Grad-CAM can be used on a wide variety of CNN model families - CNNs with fully connected layers, CNNs used for structural outputs (e.g. captioning), CNNs used in tasks with multi-model input (e.g. VQA) or reinforcement learning without architectural changes or re-training.
+In this tutorial, we show how to visualize the predictions made by convolutional neural networks using [Gradient-weighted Class Activation Mapping](https://arxiv.org/abs/1610.02391). Unlike many other visualization methods, Grad-CAM can be used on a wide variety of CNN model families - CNNs with fully connected layers, CNNs used for structured outputs (e.g. captioning), CNNs used in tasks with multi-modal input (e.g. VQA) or reinforcement learning without architectural changes or re-training.
 
-In the rest of this notebook, we will explain how to visualize predictions made by [VGG-16](https://arxiv.org/abs/1409.1556). We begin by importing the required dependencies. `gradcam` module contains the implementation of visualization techniques used in this notebook.
+In the rest of this notebook, we will explain how to visualize predictions made by [VGG-16](https://arxiv.org/abs/1409.1556). We begin by importing the required dependencies. 
+
+## Prerequisites
+* OpenCV is required by `gradcam` (below) and can be installed with pip using `pip install opencv-python`.
+
+* the `gradcam` module contains the implementation of visualization techniques used in this notebook. `gradcam` can be installed to a temporary directory by executing the following code block.
 
 ```python
 from __future__ import print_function

From 6990b7d8e27ca6bd79b6df2c4becc8782417a6f8 Mon Sep 17 00:00:00 2001
From: Aaron Markham <markhama@amazon.com>
Date: Fri, 30 Nov 2018 23:38:20 -0800
Subject: [PATCH 11/54] clarify ops faq regarding docs strings (#13492)

---
 docs/faq/add_op_in_backend.md | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/docs/faq/add_op_in_backend.md b/docs/faq/add_op_in_backend.md
index ed906da27377..c44a0aa05235 100644
--- a/docs/faq/add_op_in_backend.md
+++ b/docs/faq/add_op_in_backend.md
@@ -135,7 +135,7 @@ The last line of the above code snippet is a tuple of three lists returned
 by `d.infer_shape()`. The first list contains all the argument shapes
 of `a`, `b`, and `c`. The second contains the output shape of `d`. The
 third one represents the shapes of auxiliary states, which is not used
-in this case, and thus is empty. 
+in this case, and thus is empty.
 In this example, we only specified values for variable `a`'s first dimension
 and `c`'s second dimension. The `0` in shape `(2, 0)` indicates that the size
 of the second dimension is unknown, same meaning for shape `(0, 3)`.
@@ -437,10 +437,13 @@ NNVM_REGISTER_OP(quadratic)
 where :math:`x` is an input tensor and all operations
 in the function are element-wise.
 
-Example::
-  x = [[1, 2], [3, 4]]
-  y = quadratic(data=x, a=1, b=2, c=3)
-  y = [[6, 11], [18, 27]]
+Example:
+
+  .. code-block:: python
+     :emphasize-lines: 1,3
+     x = [[1, 2], [3, 4]]
+     y = quadratic(data=x, a=1, b=2, c=3)
+     y = [[6, 11], [18, 27]]
 
 )code" ADD_FILELINE)                                                               // 4
 .set_attr_parser(ParamParser<QuadraticParam>)                                      // 5
@@ -474,8 +477,11 @@ NNVM_REGISTER_OP(_backward_quadratic)
 of `Op` type and save it in the operator manager and return a reference
 of the just created operator object.
 - Lines 3-4: Add description as an operator attribute
-including examples of the operator. The documentation engine would extract
+including examples of the operator. The documentation engine will extract
 this description and display it on the documentation web page.
+`emphasize-lines` is optional.
+For more examples and troubleshooting with doc strings, refer to the [MXNet
+developer wiki's Documentation Guide](https://cwiki.apache.org/confluence/display/MXNET/Documentation+Guide).
 - Line 5: Set parameter struct parser for the operator. It is used for parsing
 the parameters `a`, `b`, and `c` input from frontend.
 - Line 6: Set the number of inputs for the operator.
@@ -630,7 +636,7 @@ python tools/flakiness_checker.py test_operator.test_quadratic_function
 
 Please note that for `check_symbolic_forward` and `check_symbolic_backward` we pass
 both the operator symbols and expected results for comparison, for
-`check_numeric_gradient` we only pass the operator symbol, as the 
+`check_numeric_gradient` we only pass the operator symbol, as the
 `check_numeric_gradient` computes the expected value using finite difference
 method. Which is why it is highly recommended to add `check_numeric_gradient`
 test for every operator with backward function implemented as it eliminates

From 4cd335513002dc64dcf842bc28526b5e41149db2 Mon Sep 17 00:00:00 2001
From: Da Zheng <zhengda1936@gmail.com>
Date: Fri, 30 Nov 2018 23:46:50 -0800
Subject: [PATCH 12/54] Add graph_compact operator. (#13436)

* add graph_compact.

* fix.

* add doc.

* add tests for graph_compact.

* address comments.

* update docs.

* trigger CI
---
 docs/api/python/ndarray/contrib.md      |   5 +
 docs/api/python/symbol/contrib.md       |  11 ++
 src/operator/contrib/dgl_graph.cc       | 222 +++++++++++++++++++++++-
 tests/python/unittest/test_dgl_graph.py |  40 +++++
 4 files changed, 276 insertions(+), 2 deletions(-)

diff --git a/docs/api/python/ndarray/contrib.md b/docs/api/python/ndarray/contrib.md
index 709ddae007c5..d7c9021b5957 100644
--- a/docs/api/python/ndarray/contrib.md
+++ b/docs/api/python/ndarray/contrib.md
@@ -61,6 +61,11 @@ In the rest of this document, we list routines provided by the `ndarray.contrib`
     index_copy
     getnnz
     edge_id
+    dgl_csr_neighbor_uniform_sample
+    dgl_csr_neighbor_non_uniform_sample
+    dgl_subgraph
+    dgl_adjacency
+    dgl_graph_compact
 ```
 
 ## API Reference
diff --git a/docs/api/python/symbol/contrib.md b/docs/api/python/symbol/contrib.md
index c0a4da54cbde..35cd11c89a70 100644
--- a/docs/api/python/symbol/contrib.md
+++ b/docs/api/python/symbol/contrib.md
@@ -55,6 +55,17 @@ In the rest of this document, we list routines provided by the `symbol.contrib`
     foreach
     while_loop
     cond
+    isinf
+    isfinite
+    isnan
+    index_copy
+    getnnz
+    edge_id
+    dgl_csr_neighbor_uniform_sample
+    dgl_csr_neighbor_non_uniform_sample
+    dgl_subgraph
+    dgl_adjacency
+    dgl_graph_compact
 ```
 
 ## API Reference
diff --git a/src/operator/contrib/dgl_graph.cc b/src/operator/contrib/dgl_graph.cc
index 74ad3d435648..ed7caacfdbae 100644
--- a/src/operator/contrib/dgl_graph.cc
+++ b/src/operator/contrib/dgl_graph.cc
@@ -768,7 +768,10 @@ static void CSRNeighborUniformSampleComputeExCPU(const nnvm::NodeAttrs& attrs,
 NNVM_REGISTER_OP(_contrib_dgl_csr_neighbor_uniform_sample)
 .describe(R"code(This operator samples sub-graph from a csr graph via an
 uniform probability. 
-Example::
+
+Example:
+
+.. code:: python
 
   shape = (5, 5)
   data_np = np.array([1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20], dtype=np.int64)
@@ -850,7 +853,10 @@ static void CSRNeighborNonUniformSampleComputeExCPU(const nnvm::NodeAttrs& attrs
 NNVM_REGISTER_OP(_contrib_dgl_csr_neighbor_non_uniform_sample)
 .describe(R"code(This operator samples sub-graph from a csr graph via an
 uniform probability. 
-Example::
+
+Example:
+
+.. code:: python
 
   shape = (5, 5)
   prob = mx.nd.array([0.9, 0.8, 0.2, 0.4, 0.1], dtype=np.float32)
@@ -1379,6 +1385,8 @@ the data value of float32.
 
 Example:
 
+.. code:: python
+
   x = [[ 1, 0, 0 ],
        [ 0, 2, 0 ],
        [ 0, 0, 3 ]]
@@ -1400,5 +1408,215 @@ the data value of float32.
 .set_attr<FComputeEx>("FComputeEx<cpu>", DGLAdjacencyForwardEx<cpu>)
 .add_argument("data", "NDArray-or-Symbol", "Input ndarray");
 
+///////////////////////// Compact subgraphs ///////////////////////////
+
+struct SubgraphCompactParam : public dmlc::Parameter<SubgraphCompactParam> {
+  int num_args;
+  bool return_mapping;
+  nnvm::Tuple<nnvm::dim_t> graph_sizes;
+  DMLC_DECLARE_PARAMETER(SubgraphCompactParam) {
+    DMLC_DECLARE_FIELD(num_args).set_lower_bound(2)
+    .describe("Number of input arguments.");
+    DMLC_DECLARE_FIELD(return_mapping)
+    .describe("Return mapping of vid and eid between the subgraph and the parent graph.");
+    DMLC_DECLARE_FIELD(graph_sizes)
+    .describe("the number of vertices in each graph.");
+  }
+};  // struct SubgraphCompactParam
+
+DMLC_REGISTER_PARAMETER(SubgraphCompactParam);
+
+static inline size_t get_num_graphs(const SubgraphCompactParam &params) {
+  // Each CSR needs a 1D array to store the original vertex Id for each row.
+  return params.num_args / 2;
+}
+
+static void CompactSubgraph(const NDArray &csr, const NDArray &vids,
+                            const NDArray &out_csr, size_t graph_size) {
+  TBlob in_idx_data = csr.aux_data(csr::kIdx);
+  TBlob in_ptr_data = csr.aux_data(csr::kIndPtr);
+  const dgl_id_t *indices_in = in_idx_data.dptr<dgl_id_t>();
+  const dgl_id_t *indptr_in = in_ptr_data.dptr<dgl_id_t>();
+  const dgl_id_t *row_ids = vids.data().dptr<dgl_id_t>();
+  size_t num_elems = csr.aux_data(csr::kIdx).shape_.Size();
+  // The last element in vids is the actual number of vertices in the subgraph.
+  CHECK_EQ(vids.shape()[0], in_ptr_data.shape_[0]);
+  CHECK_EQ(static_cast<size_t>(row_ids[vids.shape()[0] - 1]), graph_size);
+
+  // Prepare the Id map from the original graph to the subgraph.
+  std::unordered_map<dgl_id_t, dgl_id_t> id_map;
+  id_map.reserve(graph_size);
+  for (size_t i = 0; i < graph_size; i++) {
+    id_map.insert(std::pair<dgl_id_t, dgl_id_t>(row_ids[i], i));
+    CHECK_NE(row_ids[i], -1);
+  }
+
+  TShape nz_shape(1);
+  nz_shape[0] = num_elems;
+  TShape indptr_shape(1);
+  CHECK_EQ(out_csr.shape()[0], graph_size);
+  indptr_shape[0] = graph_size + 1;
+  CHECK_GE(in_ptr_data.shape_[0], indptr_shape[0]);
+
+  out_csr.CheckAndAllocData(nz_shape);
+  out_csr.CheckAndAllocAuxData(csr::kIdx, nz_shape);
+  out_csr.CheckAndAllocAuxData(csr::kIndPtr, indptr_shape);
+
+  dgl_id_t *indices_out = out_csr.aux_data(csr::kIdx).dptr<dgl_id_t>();
+  dgl_id_t *indptr_out = out_csr.aux_data(csr::kIndPtr).dptr<dgl_id_t>();
+  dgl_id_t *sub_eids = out_csr.data().dptr<dgl_id_t>();
+  std::copy(indptr_in, indptr_in + indptr_shape[0], indptr_out);
+  for (int64_t i = 0; i < nz_shape[0]; i++) {
+    dgl_id_t old_id = indices_in[i];
+    auto it = id_map.find(old_id);
+    CHECK(it != id_map.end());
+    indices_out[i] = it->second;
+    sub_eids[i] = i;
+  }
+}
+
+static void SubgraphCompactComputeExCPU(const nnvm::NodeAttrs& attrs,
+                                        const OpContext& ctx,
+                                        const std::vector<NDArray>& inputs,
+                                        const std::vector<OpReqType>& req,
+                                        const std::vector<NDArray>& outputs) {
+  const SubgraphCompactParam& params = nnvm::get<SubgraphCompactParam>(attrs.parsed);
+  int num_g = get_num_graphs(params);
+#pragma omp parallel for
+  for (int i = 0; i < num_g; i++) {
+    CompactSubgraph(inputs[i], inputs[i + num_g], outputs[i], params.graph_sizes[i]);
+  }
+}
+
+static bool SubgraphCompactStorageType(const nnvm::NodeAttrs& attrs,
+                                       const int dev_mask,
+                                       DispatchMode* dispatch_mode,
+                                       std::vector<int> *in_attrs,
+                                       std::vector<int> *out_attrs) {
+  const SubgraphCompactParam& params = nnvm::get<SubgraphCompactParam>(attrs.parsed);
+  size_t num_g = get_num_graphs(params);
+  CHECK_EQ(num_g * 2, in_attrs->size());
+  // These are the input subgraphs.
+  for (size_t i = 0; i < num_g; i++)
+    CHECK_EQ(in_attrs->at(i), kCSRStorage);
+  // These are the vertex Ids in the original graph.
+  for (size_t i = 0; i < num_g; i++)
+    CHECK_EQ(in_attrs->at(i + num_g), kDefaultStorage);
+
+  bool success = true;
+  *dispatch_mode = DispatchMode::kFComputeEx;
+  for (size_t i = 0; i < out_attrs->size(); i++) {
+    if (!type_assign(&(*out_attrs)[i], mxnet::kCSRStorage))
+      success = false;
+  }
+  return success;
+}
+
+static bool SubgraphCompactShape(const nnvm::NodeAttrs& attrs,
+                                 std::vector<TShape> *in_attrs,
+                                 std::vector<TShape> *out_attrs) {
+  const SubgraphCompactParam& params = nnvm::get<SubgraphCompactParam>(attrs.parsed);
+  size_t num_g = get_num_graphs(params);
+  CHECK_EQ(num_g * 2, in_attrs->size());
+  // These are the input subgraphs.
+  for (size_t i = 0; i < num_g; i++) {
+    CHECK_EQ(in_attrs->at(i).ndim(), 2U);
+    CHECK_GE(in_attrs->at(i)[0], params.graph_sizes[i]);
+    CHECK_GE(in_attrs->at(i)[1], params.graph_sizes[i]);
+  }
+  // These are the vertex Ids in the original graph.
+  for (size_t i = 0; i < num_g; i++) {
+    CHECK_EQ(in_attrs->at(i + num_g).ndim(), 1U);
+    CHECK_GE(in_attrs->at(i + num_g)[0], params.graph_sizes[i]);
+  }
+
+  for (size_t i = 0; i < num_g; i++) {
+    TShape gshape(2);
+    gshape[0] = params.graph_sizes[i];
+    gshape[1] = params.graph_sizes[i];
+    out_attrs->at(i) = gshape;
+    if (params.return_mapping)
+      out_attrs->at(i + num_g) = gshape;
+  }
+  return true;
+}
+
+static bool SubgraphCompactType(const nnvm::NodeAttrs& attrs,
+                                std::vector<int> *in_attrs,
+                                std::vector<int> *out_attrs) {
+  for (size_t i = 0; i < in_attrs->size(); i++) {
+    CHECK_EQ(in_attrs->at(i), mshadow::kInt64);
+  }
+  for (size_t i = 0; i < out_attrs->size(); i++) {
+    out_attrs->at(i) = mshadow::kInt64;
+  }
+  return true;
+}
+
+NNVM_REGISTER_OP(_contrib_dgl_graph_compact)
+.describe(R"code(This operator compacts a CSR matrix generated by
+dgl_csr_neighbor_uniform_sample and dgl_csr_neighbor_non_uniform_sample.
+The CSR matrices generated by these two operators may have many empty
+rows at the end and many empty columns. This operator removes these
+empty rows and empty columns.
+
+Example:
+
+.. code:: python
+
+  shape = (5, 5)
+  data_np = np.array([1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20], dtype=np.int64)
+  indices_np = np.array([1,2,3,4,0,2,3,4,0,1,3,4,0,1,2,4,0,1,2,3], dtype=np.int64)
+  indptr_np = np.array([0,4,8,12,16,20], dtype=np.int64)
+  a = mx.nd.sparse.csr_matrix((data_np, indices_np, indptr_np), shape=shape)
+  seed = mx.nd.array([0,1,2,3,4], dtype=np.int64)
+  out = mx.nd.contrib.dgl_csr_neighbor_uniform_sample(a, seed, num_args=2, num_hops=1,
+          num_neighbor=2, max_num_vertices=6)
+  subg_v = out[0]
+  subg = out[1]
+  compact = mx.nd.contrib.dgl_graph_compact(subg, subg_v,
+          graph_sizes=(subg_v[-1].asnumpy()[0]), return_mapping=False)
+
+  compact.asnumpy()
+  array([[0, 0, 0, 1, 0],
+         [2, 0, 3, 0, 0],
+         [0, 4, 0, 0, 5],
+         [0, 6, 0, 0, 7],
+         [8, 9, 0, 0, 0]])
+
+)code" ADD_FILELINE)
+.set_attr_parser(ParamParser<SubgraphCompactParam>)
+.set_num_inputs([](const NodeAttrs& attrs) {
+  const SubgraphCompactParam& params = nnvm::get<SubgraphCompactParam>(attrs.parsed);
+  return params.num_args;
+})
+.set_num_outputs([](const NodeAttrs& attrs) {
+  const SubgraphCompactParam& params = nnvm::get<SubgraphCompactParam>(attrs.parsed);
+  int num_varray = get_num_graphs(params);
+  if (params.return_mapping)
+    return num_varray * 2;
+  else
+    return num_varray;
+})
+.set_attr<nnvm::FListInputNames>("FListInputNames",
+    [](const NodeAttrs& attrs) {
+  const SubgraphCompactParam& params = nnvm::get<SubgraphCompactParam>(attrs.parsed);
+  std::vector<std::string> names;
+  names.reserve(params.num_args);
+  size_t num_graphs = get_num_graphs(params);
+  for (size_t i = 0; i < num_graphs; i++)
+    names.push_back("graph" + std::to_string(i));
+  for (size_t i = 0; i < num_graphs; ++i)
+    names.push_back("varray" + std::to_string(i));
+  return names;
+})
+.set_attr<FInferStorageType>("FInferStorageType", SubgraphCompactStorageType)
+.set_attr<nnvm::FInferShape>("FInferShape", SubgraphCompactShape)
+.set_attr<nnvm::FInferType>("FInferType", SubgraphCompactType)
+.set_attr<FComputeEx>("FComputeEx<cpu>", SubgraphCompactComputeExCPU)
+.set_attr<std::string>("key_var_num_args", "num_args")
+.add_argument("graph_data", "NDArray-or-Symbol[]", "Input graphs and input vertex Ids.")
+.add_arguments(SubgraphCompactParam::__FIELDS__());
+
 }  // namespace op
 }  // namespace mxnet
diff --git a/tests/python/unittest/test_dgl_graph.py b/tests/python/unittest/test_dgl_graph.py
index f996d7f38de8..069fef6e32f0 100644
--- a/tests/python/unittest/test_dgl_graph.py
+++ b/tests/python/unittest/test_dgl_graph.py
@@ -63,6 +63,18 @@ def check_non_uniform(out, num_hops, max_num_vertices):
     for data in layer:
         assert(data <= num_hops)
 
+def check_compact(csr, id_arr, num_nodes):
+    compact = mx.nd.contrib.dgl_graph_compact(csr, id_arr, graph_sizes=num_nodes, return_mapping=False)
+    assert compact.shape[0] == num_nodes
+    assert compact.shape[1] == num_nodes
+    assert mx.nd.sum(compact.indptr == csr.indptr[0:(num_nodes + 1)]).asnumpy() == num_nodes + 1
+    sub_indices = compact.indices.asnumpy()
+    indices = csr.indices.asnumpy()
+    id_arr = id_arr.asnumpy()
+    for i in range(len(sub_indices)):
+        sub_id = sub_indices[i]
+        assert id_arr[sub_id] == indices[i]
+
 def test_uniform_sample():
     shape = (5, 5)
     data_np = np.array([1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20], dtype=np.int64)
@@ -74,36 +86,64 @@ def test_uniform_sample():
     out = mx.nd.contrib.dgl_csr_neighbor_uniform_sample(a, seed, num_args=2, num_hops=1, num_neighbor=2, max_num_vertices=5)
     assert (len(out) == 3)
     check_uniform(out, num_hops=1, max_num_vertices=5)
+    num_nodes = out[0][-1].asnumpy()
+    assert num_nodes > 0
+    assert num_nodes < len(out[0])
+    check_compact(out[1], out[0], num_nodes)
 
     seed = mx.nd.array([0], dtype=np.int64)
     out = mx.nd.contrib.dgl_csr_neighbor_uniform_sample(a, seed, num_args=2, num_hops=1, num_neighbor=1, max_num_vertices=4)
     assert (len(out) == 3)
     check_uniform(out, num_hops=1, max_num_vertices=4)
+    num_nodes = out[0][-1].asnumpy()
+    assert num_nodes > 0
+    assert num_nodes < len(out[0])
+    check_compact(out[1], out[0], num_nodes)
 
     seed = mx.nd.array([0], dtype=np.int64)
     out = mx.nd.contrib.dgl_csr_neighbor_uniform_sample(a, seed, num_args=2, num_hops=2, num_neighbor=1, max_num_vertices=4)
     assert (len(out) == 3)
     check_uniform(out, num_hops=2, max_num_vertices=4)
+    num_nodes = out[0][-1].asnumpy()
+    assert num_nodes > 0
+    assert num_nodes < len(out[0])
+    check_compact(out[1], out[0], num_nodes)
 
     seed = mx.nd.array([0,2,4], dtype=np.int64)
     out = mx.nd.contrib.dgl_csr_neighbor_uniform_sample(a, seed, num_args=2, num_hops=1, num_neighbor=2, max_num_vertices=5)
     assert (len(out) == 3)
     check_uniform(out, num_hops=1, max_num_vertices=5)
+    num_nodes = out[0][-1].asnumpy()
+    assert num_nodes > 0
+    assert num_nodes < len(out[0])
+    check_compact(out[1], out[0], num_nodes)
 
     seed = mx.nd.array([0,4], dtype=np.int64)
     out = mx.nd.contrib.dgl_csr_neighbor_uniform_sample(a, seed, num_args=2, num_hops=1, num_neighbor=2, max_num_vertices=5)
     assert (len(out) == 3)
     check_uniform(out, num_hops=1, max_num_vertices=5)
+    num_nodes = out[0][-1].asnumpy()
+    assert num_nodes > 0
+    assert num_nodes < len(out[0])
+    check_compact(out[1], out[0], num_nodes)
 
     seed = mx.nd.array([0,4], dtype=np.int64)
     out = mx.nd.contrib.dgl_csr_neighbor_uniform_sample(a, seed, num_args=2, num_hops=2, num_neighbor=2, max_num_vertices=5)
     assert (len(out) == 3)
     check_uniform(out, num_hops=2, max_num_vertices=5)
+    num_nodes = out[0][-1].asnumpy()
+    assert num_nodes > 0
+    assert num_nodes < len(out[0])
+    check_compact(out[1], out[0], num_nodes)
 
     seed = mx.nd.array([0,4], dtype=np.int64)
     out = mx.nd.contrib.dgl_csr_neighbor_uniform_sample(a, seed, num_args=2, num_hops=1, num_neighbor=2, max_num_vertices=5)
     assert (len(out) == 3)
     check_uniform(out, num_hops=1, max_num_vertices=5)
+    num_nodes = out[0][-1].asnumpy()
+    assert num_nodes > 0
+    assert num_nodes < len(out[0])
+    check_compact(out[1], out[0], num_nodes)
 
 def test_non_uniform_sample():
     shape = (5, 5)

From 1f73c5d9d308a690b57ea1b474d2ba99ca06c476 Mon Sep 17 00:00:00 2001
From: Marco de Abreu <marcoabreu@users.noreply.github.com>
Date: Sat, 1 Dec 2018 21:57:33 +0100
Subject: [PATCH 13/54] Deprecate Jenkinsfile (#13474)

---
 Jenkinsfile | 1010 ---------------------------------------------------
 1 file changed, 1010 deletions(-)
 delete mode 100644 Jenkinsfile

diff --git a/Jenkinsfile b/Jenkinsfile
deleted file mode 100644
index 015ca81bad76..000000000000
--- a/Jenkinsfile
+++ /dev/null
@@ -1,1010 +0,0 @@
-// -*- mode: groovy -*-
-
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-//
-// Jenkins pipeline
-// See documents at https://jenkins.io/doc/book/pipeline/jenkinsfile/
-
-
-/***
- *      _____  _                          
- *     |  __ \| |                         
- *     | |__) | | ___  __ _ ___  ___      
- *     |  ___/| |/ _ \/ _` / __|/ _ \     
- *     | | | || |  __/ (_| \__ \  __/     
- *     |_|_| ||_|\___|\__,_|___/\___|     
- *      / _` |/ _ \                       
- *     | (_| | (_) |_                     
- *      \__,_|\___/| |                    
- *      _ __   ___ | |_                   
- *     | '_ \ / _ \| __|    _ _  __       
- *     | | | | (_) | |_    | (_)/ _|      
- *     |_|_|_|\___/_\__| __| |_| |_ _   _ 
- *     | '_ ` _ \ / _ \ / _` | |  _| | | |
- *     | | | | | | (_) | (_| | | | | |_| |
- *     |_| |_| |_|\___/ \__,_|_|_|  \__, |
- *                                   __/ |
- *                                  |___/ 
- *
- * This file is about to be deprecated! See https://github.com/apache/incubator-mxnet/pull/13344
- * for more details
- */
-
-
-// mxnet libraries
-mx_lib = 'lib/libmxnet.so, lib/libmxnet.a, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a'
-
-// Python wheels
-mx_pip = 'build/*.whl'
-
-// for scala build, need to pass extra libs when run with dist_kvstore
-mx_dist_lib = 'lib/libmxnet.so, lib/libmxnet.a, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a, 3rdparty/ps-lite/build/libps.a, deps/lib/libprotobuf-lite.a, deps/lib/libzmq.a'
-// mxnet cmake libraries, in cmake builds we do not produce a libnvvm static library by default.
-mx_cmake_lib = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/dmlc-core/libdmlc.a, build/tests/mxnet_unit_tests, build/3rdparty/openmp/runtime/src/libomp.so'
-// mxnet cmake libraries, in cmake builds we do not produce a libnvvm static library by default.
-mx_cmake_lib_debug = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/dmlc-core/libdmlc.a, build/tests/mxnet_unit_tests'
-mx_cmake_mkldnn_lib = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/dmlc-core/libdmlc.a, build/tests/mxnet_unit_tests, build/3rdparty/openmp/runtime/src/libomp.so, build/3rdparty/mkldnn/src/libmkldnn.so.0'
-mx_mkldnn_lib = 'lib/libmxnet.so, lib/libmxnet.a, lib/libiomp5.so, lib/libmkldnn.so.0, lib/libmklml_intel.so, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a'
-mx_tensorrt_lib = 'lib/libmxnet.so, lib/libnvonnxparser_runtime.so.0, lib/libnvonnxparser.so.0, lib/libonnx_proto.so, lib/libonnx.so'
-mx_lib_cpp_examples = 'lib/libmxnet.so, lib/libmxnet.a, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a, 3rdparty/ps-lite/build/libps.a, deps/lib/libprotobuf-lite.a, deps/lib/libzmq.a, build/cpp-package/example/lenet, build/cpp-package/example/alexnet, build/cpp-package/example/googlenet, build/cpp-package/example/lenet_with_mxdataiter, build/cpp-package/example/resnet, build/cpp-package/example/mlp, build/cpp-package/example/mlp_cpu, build/cpp-package/example/mlp_gpu, build/cpp-package/example/test_score, build/cpp-package/example/test_optimizer'
-mx_lib_cpp_examples_cpu = 'build/libmxnet.so, build/cpp-package/example/mlp_cpu'
-
-// timeout in minutes
-max_time = 120
-
-
-// Python unittest for CPU
-// Python 2
-def python2_ut(docker_container_name) {
-  timeout(time: max_time, unit: 'MINUTES') {
-    utils.docker_run(docker_container_name, 'unittest_ubuntu_python2_cpu', false)
-  }
-}
-
-// Python 3
-def python3_ut(docker_container_name) {
-  timeout(time: max_time, unit: 'MINUTES') {
-    utils.docker_run(docker_container_name, 'unittest_ubuntu_python3_cpu', false)
-  }
-}
-
-// Python 3
-def python3_ut_asan(docker_container_name) {
-  timeout(time: max_time, unit: 'MINUTES') {
-    utils.docker_run(docker_container_name, 'unittest_ubuntu_python3_cpu_asan', false)
-  }
-}
-
-def python3_ut_mkldnn(docker_container_name) {
-  timeout(time: max_time, unit: 'MINUTES') {
-    utils.docker_run(docker_container_name, 'unittest_ubuntu_python3_cpu_mkldnn', false)
-  }
-}
-
-// GPU test has two parts. 1) run unittest on GPU, 2) compare the results on
-// both CPU and GPU
-// Python 2
-def python2_gpu_ut(docker_container_name) {
-  timeout(time: max_time, unit: 'MINUTES') {
-    utils.docker_run(docker_container_name, 'unittest_ubuntu_python2_gpu', true)
-  }
-}
-
-// Python 3
-def python3_gpu_ut(docker_container_name) {
-  timeout(time: max_time, unit: 'MINUTES') {
-    utils.docker_run(docker_container_name, 'unittest_ubuntu_python3_gpu', true)
-  }
-}
-
-// Python 3 NOCUDNN
-def python3_gpu_ut_nocudnn(docker_container_name) {
-  timeout(time: max_time, unit: 'MINUTES') {
-    utils.docker_run(docker_container_name, 'unittest_ubuntu_python3_gpu_nocudnn', true)
-  }
-}
-
-def deploy_docs() {
-  parallel 'Docs': {
-    node(NODE_LINUX_CPU) {
-      ws('workspace/docs') {
-        timeout(time: max_time, unit: 'MINUTES') {
-          utils.init_git()
-          utils.docker_run('ubuntu_cpu', 'deploy_docs', false)
-          sh "ci/other/ci_deploy_doc.sh ${env.BRANCH_NAME} ${env.BUILD_NUMBER}"
-        }
-      }
-    }
-  },
-  'Julia docs': {
-    node(NODE_LINUX_CPU) {
-      ws('workspace/julia-docs') {
-        timeout(time: max_time, unit: 'MINUTES') {
-          utils.unpack_and_init('cpu', mx_lib)
-          utils.docker_run('ubuntu_cpu', 'deploy_jl_docs', false)
-        }
-      }
-    }
-  }
-}
-
-node('utility') {
-  // Loading the utilities requires a node context unfortunately
-  checkout scm
-  utils = load('ci/Jenkinsfile_utils.groovy')
-}
-utils.assign_node_labels(utility: 'utility', linux_cpu: 'mxnetlinux-cpu', linux_gpu: 'mxnetlinux-gpu', linux_gpu_p3: 'mxnetlinux-gpu-p3', windows_cpu: 'mxnetwindows-cpu', windows_gpu: 'mxnetwindows-gpu')
-
-utils.main_wrapper(
-core_logic: {
-  stage('Sanity Check') {
-    parallel 'Lint': {
-      node(NODE_LINUX_CPU) {
-        ws('workspace/sanity-lint') {
-          utils.init_git()
-          utils.docker_run('ubuntu_cpu', 'sanity_check', false)
-        }
-      }
-    },
-    'RAT License': {
-      node(NODE_LINUX_CPU) {
-        ws('workspace/sanity-rat') {
-          utils.init_git()
-          utils.docker_run('ubuntu_rat', 'nightly_test_rat_check', false)
-        }
-      }
-    }
-  }
-
-  stage('Build') {
-    parallel 'CPU: CentOS 7': {
-      node(NODE_LINUX_CPU) {
-        ws('workspace/build-centos7-cpu') {
-          timeout(time: max_time, unit: 'MINUTES') {
-            utils.init_git()
-            utils.docker_run('centos7_cpu', 'build_centos7_cpu', false)
-            utils.pack_lib('centos7_cpu', mx_dist_lib, true)
-          }
-        }
-      }
-    },
-    'CPU: CentOS 7 MKLDNN': {
-      node(NODE_LINUX_CPU) {
-        ws('workspace/build-centos7-mkldnn') {
-          timeout(time: max_time, unit: 'MINUTES') {
-            utils.init_git()
-            utils.docker_run('centos7_cpu', 'build_centos7_mkldnn', false)
-            utils.pack_lib('centos7_mkldnn', mx_lib, true)
-          }
-        }
-      }
-    },
-    'GPU: CentOS 7': {
-      node(NODE_LINUX_CPU) {
-        ws('workspace/build-centos7-gpu') {
-          timeout(time: max_time, unit: 'MINUTES') {
-            utils.init_git()
-            utils.docker_run('centos7_gpu', 'build_centos7_gpu', false)
-            utils.pack_lib('centos7_gpu', mx_lib, true)
-          }
-        }
-      }
-    },
-    'CPU: Openblas': {
-      node(NODE_LINUX_CPU) {
-        ws('workspace/build-cpu-openblas') {
-          timeout(time: max_time, unit: 'MINUTES') {
-            utils.init_git()
-            utils.docker_run('ubuntu_cpu', 'build_ubuntu_cpu_openblas', false)
-            utils.pack_lib('cpu', mx_dist_lib, true)
-          }
-        }
-      }
-    },
-    'CPU: ASAN': {
-      node(NODE_LINUX_CPU) {
-        ws('workspace/build-cpu-asan') {
-          timeout(time: max_time, unit: 'MINUTES') {
-            utils.init_git()
-            utils.docker_run('ubuntu_cpu', 'build_ubuntu_cpu_cmake_asan', false)
-            utils.pack_lib('cpu_asan', mx_lib_cpp_examples_cpu)
-          }
-        }
-      }
-    },
-    'CPU: Openblas, debug': {
-      node(NODE_LINUX_CPU) {
-        ws('workspace/build-cpu-openblas') {
-          timeout(time: max_time, unit: 'MINUTES') {
-            utils.init_git()
-            utils.docker_run('ubuntu_cpu', 'build_ubuntu_cpu_cmake_debug', false)
-            utils.pack_lib('cpu_debug', mx_cmake_lib_debug, true)
-          }
-        }
-      }
-    },
-    'CPU: Clang 3.9': {
-      node(NODE_LINUX_CPU) {
-        ws('workspace/build-cpu-clang39') {
-          timeout(time: max_time, unit: 'MINUTES') {
-            utils.init_git()
-            utils.docker_run('ubuntu_cpu', 'build_ubuntu_cpu_clang39', false)
-          }
-        }
-      }
-    },
-    'CPU: Clang 6': {
-      node(NODE_LINUX_CPU) {
-        ws('workspace/build-cpu-clang60') {
-          timeout(time: max_time, unit: 'MINUTES') {
-            utils.init_git()
-            utils.docker_run('ubuntu_cpu', 'build_ubuntu_cpu_clang60', false)
-          }
-        }
-      }
-    },
-    'CPU: Clang Tidy': {
-      node(NODE_LINUX_CPU) {
-        ws('workspace/build-cpu-clang60_tidy') {
-          timeout(time: max_time, unit: 'MINUTES') {
-            utils.init_git()
-            utils.docker_run('ubuntu_cpu', 'build_ubuntu_cpu_clang_tidy', false)
-          }
-        }
-      }
-    },
-    'CPU: Clang 3.9 MKLDNN': {
-      node(NODE_LINUX_CPU) {
-        ws('workspace/build-cpu-mkldnn-clang39') {
-          timeout(time: max_time, unit: 'MINUTES') {
-            utils.init_git()
-            utils.docker_run('ubuntu_cpu', 'build_ubuntu_cpu_clang39_mkldnn', false)
-            utils.pack_lib('mkldnn_cpu_clang3', mx_mkldnn_lib, true)
-          }
-        }
-      }
-    },
-    'CPU: Clang 6 MKLDNN': {
-      node(NODE_LINUX_CPU) {
-        ws('workspace/build-cpu-mkldnn-clang60') {
-          timeout(time: max_time, unit: 'MINUTES') {
-            utils.init_git()
-            utils.docker_run('ubuntu_cpu', 'build_ubuntu_cpu_clang60_mkldnn', false)
-            utils.pack_lib('mkldnn_cpu_clang6', mx_mkldnn_lib, true)
-          }
-        }
-      }
-    },
-    'CPU: MKLDNN': {
-      node(NODE_LINUX_CPU) {
-        ws('workspace/build-mkldnn-cpu') {
-          timeout(time: max_time, unit: 'MINUTES') {
-            utils.init_git()
-            utils.docker_run('ubuntu_cpu', 'build_ubuntu_cpu_mkldnn', false)
-            utils.pack_lib('mkldnn_cpu', mx_mkldnn_lib, true)
-          }
-        }
-      }
-    },
-    'GPU: MKLDNN': {
-      node(NODE_LINUX_CPU) {
-        ws('workspace/build-mkldnn-gpu') {
-          timeout(time: max_time, unit: 'MINUTES') {
-            utils.init_git()
-            utils.docker_run('ubuntu_build_cuda', 'build_ubuntu_gpu_mkldnn', false)
-            utils.pack_lib('mkldnn_gpu', mx_mkldnn_lib, true)
-          }
-        }
-      }
-    },
-    'GPU: MKLDNN_CUDNNOFF': {
-       node(NODE_LINUX_CPU) {
-         ws('workspace/build-mkldnn-gpu-nocudnn') {
-           timeout(time: max_time, unit: 'MINUTES') {
-             utils.init_git()
-             utils.docker_run('ubuntu_build_cuda', 'build_ubuntu_gpu_mkldnn_nocudnn', false)
-             utils.pack_lib('mkldnn_gpu_nocudnn', mx_mkldnn_lib, true)
-           }
-         }
-       }
-    },
-    'GPU: CUDA9.1+cuDNN7': {
-      node(NODE_LINUX_CPU) {
-        ws('workspace/build-gpu') {
-          timeout(time: max_time, unit: 'MINUTES') {
-            utils.init_git()
-            utils.docker_run('ubuntu_build_cuda', 'build_ubuntu_gpu_cuda91_cudnn7', false)
-            utils.pack_lib('gpu', mx_lib_cpp_examples, true)
-          }
-        }
-      }
-    },
-    'Amalgamation MIN': {
-      node(NODE_LINUX_CPU) {
-        ws('workspace/amalgamationmin') {
-          timeout(time: max_time, unit: 'MINUTES') {
-            utils.init_git()
-            utils.docker_run('ubuntu_cpu', 'build_ubuntu_amalgamation_min', false)
-          }
-        }
-      }
-    },
-    'Amalgamation': {
-      node(NODE_LINUX_CPU) {
-        ws('workspace/amalgamation') {
-          timeout(time: max_time, unit: 'MINUTES') {
-            utils.init_git()
-            utils.docker_run('ubuntu_cpu', 'build_ubuntu_amalgamation', false)
-          }
-        }
-      }
-    },
-
-    'GPU: CMake MKLDNN': {
-      node(NODE_LINUX_CPU) {
-        ws('workspace/build-cmake-mkldnn-gpu') {
-          timeout(time: max_time, unit: 'MINUTES') {
-            utils.init_git()
-            utils.docker_run('ubuntu_gpu', 'build_ubuntu_gpu_cmake_mkldnn', false)
-            utils.pack_lib('cmake_mkldnn_gpu', mx_cmake_mkldnn_lib, true)
-          }
-        }
-      }
-    },
-    'GPU: CMake': {
-      node(NODE_LINUX_CPU) {
-        ws('workspace/build-cmake-gpu') {
-          timeout(time: max_time, unit: 'MINUTES') {
-            utils.init_git()
-            utils.docker_run('ubuntu_gpu', 'build_ubuntu_gpu_cmake', false)
-            utils.pack_lib('cmake_gpu', mx_cmake_lib, true)
-          }
-        }
-      }
-    },
-    'TensorRT': {
-      node(NODE_LINUX_CPU) {
-        ws('workspace/build-tensorrt') {
-          timeout(time: max_time, unit: 'MINUTES') {
-            utils.init_git()
-            utils.docker_run('ubuntu_gpu_tensorrt', 'build_ubuntu_gpu_tensorrt', false)
-            utils.pack_lib('tensorrt', mx_tensorrt_lib, true)
-          }
-        }
-      }
-    },
-    'Build CPU windows':{
-      node(NODE_WINDOWS_CPU) {
-        timeout(time: max_time, unit: 'MINUTES') {
-          ws('workspace/build-cpu') {
-            withEnv(['OpenBLAS_HOME=C:\\mxnet\\openblas', 'OpenCV_DIR=C:\\mxnet\\opencv_vc14', 'CUDA_PATH=C:\\CUDA\\v8.0']) {
-              utils.init_git_win()
-              powershell 'python ci/build_windows.py -f WIN_CPU'
-              stash includes: 'windows_package.7z', name: 'windows_package_cpu'
-            }
-          }
-        }
-      }
-    },
-
-    'Build GPU windows':{
-      node(NODE_WINDOWS_CPU) {
-        timeout(time: max_time, unit: 'MINUTES') {
-          ws('workspace/build-gpu') {
-            withEnv(['OpenBLAS_HOME=C:\\mxnet\\openblas', 'OpenCV_DIR=C:\\mxnet\\opencv_vc14', 'CUDA_PATH=C:\\CUDA\\v8.0']) {
-              utils.init_git_win()
-              powershell 'python ci/build_windows.py -f WIN_GPU'
-              stash includes: 'windows_package.7z', name: 'windows_package_gpu'
-            }
-          }
-        }
-      }
-    },
-    'Build GPU MKLDNN windows':{
-      node(NODE_WINDOWS_CPU) {
-        timeout(time: max_time, unit: 'MINUTES') {
-          ws('workspace/build-gpu') {
-            withEnv(['OpenBLAS_HOME=C:\\mxnet\\openblas', 'OpenCV_DIR=C:\\mxnet\\opencv_vc14', 'CUDA_PATH=C:\\CUDA\\v8.0','BUILD_NAME=vc14_gpu_mkldnn']) {
-              utils.init_git_win()
-              powershell 'python ci/build_windows.py -f WIN_GPU_MKLDNN'
-              stash includes: 'windows_package.7z', name: 'windows_package_gpu_mkldnn'
-            }
-          }
-        }
-      }
-    },
-    //'NVidia Jetson / ARMv8':{
-    //  node(NODE_LINUX_CPU) {
-    //    ws('workspace/build-jetson-armv8') {
-    //      timeout(time: max_time, unit: 'MINUTES') {
-    //        utils.init_git()
-    //        utils.docker_run('jetson', 'build_jetson', false)
-    //      }
-    //    }
-    //  }
-    //},
-    'ARMv7':{
-      node(NODE_LINUX_CPU) {
-        ws('workspace/build-ARMv7') {
-          timeout(time: max_time, unit: 'MINUTES') {
-            utils.init_git()
-            utils.docker_run('armv7', 'build_armv7', false)
-            utils.pack_lib('armv7', mx_pip)
-          }
-        }
-      }
-    },
-    'ARMv6':{
-      node(NODE_LINUX_CPU) {
-        ws('workspace/build-ARMv6') {
-          timeout(time: max_time, unit: 'MINUTES') {
-            utils.init_git()
-            utils.docker_run('armv6', 'build_armv6', false)
-          }
-        }
-      }
-    },
-    'ARMv8':{
-      node(NODE_LINUX_CPU) {
-        ws('workspace/build-ARMv8') {
-          timeout(time: max_time, unit: 'MINUTES') {
-            utils.init_git()
-            utils.docker_run('armv8', 'build_armv8', false)
-          }
-        }
-      }
-    },
-    'Android / ARMv8':{
-      node(NODE_LINUX_CPU) {
-        ws('workspace/android64') {
-          timeout(time: max_time, unit: 'MINUTES') {
-            utils.init_git()
-            utils.docker_run('android_armv8', 'build_android_armv8', false)
-          }
-        }
-      }
-    },
-    'Android / ARMv7':{
-      node(NODE_LINUX_CPU) {
-        ws('workspace/androidv7') {
-          timeout(time: max_time, unit: 'MINUTES') {
-            utils.init_git()
-            utils.docker_run('android_armv7', 'build_android_armv7', false)
-          }
-        }
-      }
-    }
-
-  } // End of stage('Build')
-
-  stage('Tests') {
-    parallel 'Python2: CPU': {
-      node(NODE_LINUX_CPU) {
-        ws('workspace/ut-python2-cpu') {
-          try {
-            utils.unpack_and_init('cpu', mx_lib, true)
-            python2_ut('ubuntu_cpu')
-            utils.publish_test_coverage()
-          } finally {
-            utils.collect_test_results_unix('nosetests_unittest.xml', 'nosetests_python2_cpu_unittest.xml')
-            utils.collect_test_results_unix('nosetests_train.xml', 'nosetests_python2_cpu_train.xml')
-            utils.collect_test_results_unix('nosetests_quantization.xml', 'nosetests_python2_cpu_quantization.xml')
-          }
-        }
-      }
-    },
-    'Python3: CPU': {
-      node(NODE_LINUX_CPU) {
-        ws('workspace/ut-python3-cpu') {
-          try {
-            utils.unpack_and_init('cpu', mx_lib, true)
-            python3_ut('ubuntu_cpu')
-            utils.publish_test_coverage()
-          } finally {
-            utils.collect_test_results_unix('nosetests_unittest.xml', 'nosetests_python3_cpu_unittest.xml')
-            utils.collect_test_results_unix('nosetests_quantization.xml', 'nosetests_python3_cpu_quantization.xml')
-          }
-        }
-      }
-    },
-    'CPU ASAN': {
-      node(NODE_LINUX_CPU) {
-        ws('workspace/ut-python3-cpu-asan') {
-            utils.unpack_and_init('cpu_asan', mx_lib_cpp_examples_cpu)
-            utils.docker_run('ubuntu_cpu', 'integrationtest_ubuntu_cpu_asan', false)
-        }
-      }
-    },
-    'Python3: CPU debug': {
-      node(NODE_LINUX_CPU) {
-        ws('workspace/ut-python3-cpu-debug') {
-          try {
-            utils.unpack_and_init('cpu_debug', mx_cmake_lib_debug, true)
-            python3_ut('ubuntu_cpu')
-          } finally {
-            utils.collect_test_results_unix('nosetests_unittest.xml', 'nosetests_python3_cpu_debug_unittest.xml')
-            utils.collect_test_results_unix('nosetests_quantization.xml', 'nosetests_python3_cpu_debug_quantization.xml')
-          }
-        }
-      }
-    },
-    'Python2: GPU': {
-      node(NODE_LINUX_GPU) {
-        ws('workspace/ut-python2-gpu') {
-          try {
-            utils.unpack_and_init('gpu', mx_lib, true)
-            python2_gpu_ut('ubuntu_gpu')
-            utils.publish_test_coverage()
-          } finally {
-            utils.collect_test_results_unix('nosetests_gpu.xml', 'nosetests_python2_gpu.xml')
-          }
-        }
-      }
-    },
-    'Python3: GPU': {
-      node(NODE_LINUX_GPU) {
-        ws('workspace/ut-python3-gpu') {
-          try {
-            utils.unpack_and_init('gpu', mx_lib, true)
-            python3_gpu_ut('ubuntu_gpu')
-            utils.publish_test_coverage()
-          } finally {
-            utils.collect_test_results_unix('nosetests_gpu.xml', 'nosetests_python3_gpu.xml')
-          }
-        }
-      }
-    },
-    'Python2: Quantize GPU': {
-      node(NODE_LINUX_GPU_P3) {
-        ws('workspace/ut-python2-quantize-gpu') {
-          timeout(time: max_time, unit: 'MINUTES') {
-            try {
-              utils.unpack_and_init('gpu', mx_lib, true)
-              utils.docker_run('ubuntu_gpu', 'unittest_ubuntu_python2_quantization_gpu', true)
-              utils.publish_test_coverage()
-            } finally {
-              utils.collect_test_results_unix('nosetests_quantization_gpu.xml', 'nosetests_python2_quantize_gpu.xml')
-            }
-          }
-        }
-      }
-    },
-    'Python3: Quantize GPU': {
-      node(NODE_LINUX_GPU_P3) {
-        ws('workspace/ut-python3-quantize-gpu') {
-          timeout(time: max_time, unit: 'MINUTES') {
-            try {
-              utils.unpack_and_init('gpu', mx_lib, true)
-              utils.docker_run('ubuntu_gpu', 'unittest_ubuntu_python3_quantization_gpu', true)
-              utils.publish_test_coverage()
-            } finally {
-              utils.collect_test_results_unix('nosetests_quantization_gpu.xml', 'nosetests_python3_quantize_gpu.xml')
-            }
-          }
-        }
-      }
-    },
-    'Python2: MKLDNN-CPU': {
-      node(NODE_LINUX_CPU) {
-        ws('workspace/ut-python2-mkldnn-cpu') {
-          try {
-            utils.unpack_and_init('mkldnn_cpu', mx_mkldnn_lib, true)
-            python2_ut('ubuntu_cpu')
-            utils.publish_test_coverage()
-          } finally {
-            utils.collect_test_results_unix('nosetests_unittest.xml', 'nosetests_python2_mkldnn_cpu_unittest.xml')
-            utils.collect_test_results_unix('nosetests_train.xml', 'nosetests_python2_mkldnn_cpu_train.xml')
-            utils.collect_test_results_unix('nosetests_quantization.xml', 'nosetests_python2_mkldnn_cpu_quantization.xml')
-          }
-        }
-      }
-    },
-    'Python2: MKLDNN-GPU': {
-      node(NODE_LINUX_GPU) {
-        ws('workspace/ut-python2-mkldnn-gpu') {
-          try {
-            utils.unpack_and_init('mkldnn_gpu', mx_mkldnn_lib, true)
-            python2_gpu_ut('ubuntu_gpu')
-            utils.publish_test_coverage()
-          } finally {
-            utils.collect_test_results_unix('nosetests_gpu.xml', 'nosetests_python2_mkldnn_gpu.xml')
-          }
-        }
-      }
-    },
-    'Python3: MKLDNN-CPU': {
-      node(NODE_LINUX_CPU) {
-        ws('workspace/ut-python3-mkldnn-cpu') {
-          try {
-            utils.unpack_and_init('mkldnn_cpu', mx_mkldnn_lib, true)
-            python3_ut_mkldnn('ubuntu_cpu')
-            utils.publish_test_coverage()
-          } finally {
-            utils.collect_test_results_unix('nosetests_unittest.xml', 'nosetests_python3_mkldnn_cpu_unittest.xml')
-            utils.collect_test_results_unix('nosetests_mkl.xml', 'nosetests_python3_mkldnn_cpu_mkl.xml')
-          }
-        }
-      }
-    },
-    'Python3: MKLDNN-GPU': {
-      node(NODE_LINUX_GPU) {
-        ws('workspace/ut-python3-mkldnn-gpu') {
-          try {
-            utils.unpack_and_init('mkldnn_gpu', mx_mkldnn_lib, true)
-            python3_gpu_ut('ubuntu_gpu')
-            utils.publish_test_coverage()
-          } finally {
-            utils.collect_test_results_unix('nosetests_gpu.xml', 'nosetests_python3_mkldnn_gpu.xml')
-          }
-        }
-      }
-    },
-    'Python3: MKLDNN-GPU-NOCUDNN': {
-      node(NODE_LINUX_GPU) {
-        ws('workspace/ut-python3-mkldnn-gpu-nocudnn') {
-          try {
-            utils.unpack_and_init('mkldnn_gpu_nocudnn', mx_mkldnn_lib, true)
-            python3_gpu_ut_nocudnn('ubuntu_gpu')
-            utils.publish_test_coverage()
-          } finally {
-            utils.collect_test_results_unix('nosetests_gpu.xml', 'nosetests_python3_mkldnn_gpu_nocudnn.xml')
-          }
-        }
-      }
-    },
-    'Python3: CentOS 7 CPU': {
-      node(NODE_LINUX_CPU) {
-        ws('workspace/build-centos7-cpu') {
-          timeout(time: max_time, unit: 'MINUTES') {
-            try {
-              utils.unpack_and_init('centos7_cpu', mx_lib, true)
-              utils.docker_run('centos7_cpu', 'unittest_centos7_cpu', false)
-              utils.publish_test_coverage()
-            } finally {
-              utils.collect_test_results_unix('nosetests_unittest.xml', 'nosetests_python3_centos7_cpu_unittest.xml')
-              utils.collect_test_results_unix('nosetests_train.xml', 'nosetests_python3_centos7_cpu_train.xml')
-            }
-          }
-        }
-      }
-    },
-    'Python3: CentOS 7 GPU': {
-      node(NODE_LINUX_GPU) {
-        ws('workspace/build-centos7-gpu') {
-          timeout(time: max_time, unit: 'MINUTES') {
-            try {
-              utils.unpack_and_init('centos7_gpu', mx_lib, true)
-              utils.docker_run('centos7_gpu', 'unittest_centos7_gpu', true)
-              utils.publish_test_coverage()
-            } finally {
-              utils.collect_test_results_unix('nosetests_gpu.xml', 'nosetests_python3_centos7_gpu.xml')
-            }
-          }
-        }
-      }
-    },
-    'Python3: TensorRT GPU': {
-      node(NODE_LINUX_GPU_P3) {
-        ws('workspace/build-tensorrt') {
-          timeout(time: max_time, unit: 'MINUTES') {
-            try {
-              utils.unpack_and_init('tensorrt', mx_tensorrt_lib, true)
-              utils.docker_run('ubuntu_gpu_tensorrt', 'unittest_ubuntu_tensorrt_gpu', true)
-              utils.publish_test_coverage()
-            } finally {
-              utils.collect_test_results_unix('nosetests_tensorrt.xml', 'nosetests_python3_tensorrt_gpu.xml')
-            }
-          }
-        }
-      }
-    },
-    'Scala: CPU': {
-      node(NODE_LINUX_CPU) {
-        ws('workspace/ut-scala-cpu') {
-          timeout(time: max_time, unit: 'MINUTES') {
-            utils.unpack_and_init('cpu', mx_dist_lib, true)
-            utils.docker_run('ubuntu_cpu', 'unittest_ubuntu_cpu_scala', false)
-            utils.publish_test_coverage()
-          }
-        }
-      }
-    },
-    'Scala: CentOS CPU': {
-      node(NODE_LINUX_CPU) {
-        ws('workspace/ut-scala-centos7-cpu') {
-          timeout(time: max_time, unit: 'MINUTES') {
-            utils.unpack_and_init('centos7_cpu', mx_dist_lib, true)
-            utils.docker_run('centos7_cpu', 'unittest_centos7_cpu_scala', false)
-            utils.publish_test_coverage()
-          }
-        }
-      }
-    },
-    'Clojure: CPU': {
-      node(NODE_LINUX_CPU) {
-        ws('workspace/ut-clojure-cpu') {
-          timeout(time: max_time, unit: 'MINUTES') {
-            utils.unpack_and_init('cpu', mx_dist_lib, true)
-            utils.docker_run('ubuntu_cpu', 'unittest_ubuntu_cpu_clojure', false)
-            utils.publish_test_coverage()
-          }
-        }
-      }
-    },
-    'Perl: CPU': {
-      node(NODE_LINUX_CPU) {
-        ws('workspace/ut-perl-cpu') {
-          timeout(time: max_time, unit: 'MINUTES') {
-            utils.unpack_and_init('cpu', mx_lib, true)
-            utils.docker_run('ubuntu_cpu', 'unittest_ubuntu_cpugpu_perl', false)
-            utils.publish_test_coverage()
-          }
-        }
-      }
-    },
-    'Perl: GPU': {
-      node(NODE_LINUX_GPU) {
-        ws('workspace/ut-perl-gpu') {
-          timeout(time: max_time, unit: 'MINUTES') {
-            utils.unpack_and_init('gpu', mx_lib, true)
-            utils.docker_run('ubuntu_gpu', 'unittest_ubuntu_cpugpu_perl', true)
-            utils.publish_test_coverage()
-          }
-        }
-      }
-    },
-    'Cpp: GPU': {
-      node(NODE_LINUX_GPU) {
-        ws('workspace/ut-cpp-gpu') {
-          timeout(time: max_time, unit: 'MINUTES') {
-            utils.unpack_and_init('cmake_gpu', mx_cmake_lib, true)
-            utils.docker_run('ubuntu_gpu', 'unittest_ubuntu_gpu_cpp', true)
-            utils.publish_test_coverage()
-          }
-        }
-      }
-    },
-    'Cpp: MKLDNN+GPU': {
-      node(NODE_LINUX_GPU) {
-        ws('workspace/ut-cpp-mkldnn-gpu') {
-          timeout(time: max_time, unit: 'MINUTES') {
-            utils.unpack_and_init('cmake_mkldnn_gpu', mx_cmake_mkldnn_lib, true)
-            utils.docker_run('ubuntu_gpu', 'unittest_ubuntu_gpu_cpp', true)
-            utils.publish_test_coverage()
-          }
-        }
-      }
-    },
-    'R: CPU': {
-      node(NODE_LINUX_CPU) {
-        ws('workspace/ut-r-cpu') {
-          timeout(time: max_time, unit: 'MINUTES') {
-            utils.unpack_and_init('cpu', mx_lib, true)
-            utils.docker_run('ubuntu_cpu', 'unittest_ubuntu_cpu_R', false)
-            utils.publish_test_coverage()
-          }
-        }
-      }
-    },
-    'R: GPU': {
-      node(NODE_LINUX_GPU) {
-        ws('workspace/ut-r-gpu') {
-          timeout(time: max_time, unit: 'MINUTES') {
-            utils.unpack_and_init('gpu', mx_lib, true)
-            utils.docker_run('ubuntu_gpu', 'unittest_ubuntu_gpu_R', true)
-            utils.publish_test_coverage()
-          }
-        }
-      }
-    },
-    'Julia 0.6: CPU': {
-      node(NODE_LINUX_CPU) {
-        ws('workspace/ut-julia06-cpu') {
-          timeout(time: max_time, unit: 'MINUTES') {
-            utils.unpack_and_init('cpu', mx_lib)
-            utils.docker_run('ubuntu_cpu', 'unittest_ubuntu_cpu_julia06', false)
-          }
-        }
-      }
-    },
-
-    'Python 2: CPU Win':{
-      node(NODE_WINDOWS_CPU) {
-        timeout(time: max_time, unit: 'MINUTES') {
-          ws('workspace/ut-python-cpu') {
-            try {
-              utils.init_git_win()
-              unstash 'windows_package_cpu'
-              powershell 'ci/windows/test_py2_cpu.ps1'
-            } finally {
-              utils.collect_test_results_windows('nosetests_unittest.xml', 'nosetests_unittest_windows_python2_cpu.xml')
-            }
-          }
-        }
-      }
-    },
-    'Python 3: CPU Win': {
-      node(NODE_WINDOWS_CPU) {
-        timeout(time: max_time, unit: 'MINUTES') {
-          ws('workspace/ut-python-cpu') {
-            try {
-              utils.init_git_win()
-              unstash 'windows_package_cpu'
-              powershell 'ci/windows/test_py3_cpu.ps1'
-            } finally {
-              utils.collect_test_results_windows('nosetests_unittest.xml', 'nosetests_unittest_windows_python3_cpu.xml')
-            }
-          }
-        }
-      }
-    },
-    'Python 2: GPU Win':{
-      node(NODE_WINDOWS_GPU) {
-        timeout(time: max_time, unit: 'MINUTES') {
-          ws('workspace/ut-python-gpu') {
-            try {
-              utils.init_git_win()
-              unstash 'windows_package_gpu'
-              powershell 'ci/windows/test_py2_gpu.ps1'
-            } finally {
-              utils.collect_test_results_windows('nosetests_forward.xml', 'nosetests_gpu_forward_windows_python2_gpu.xml')
-              utils.collect_test_results_windows('nosetests_operator.xml', 'nosetests_gpu_operator_windows_python2_gpu.xml')
-            }
-          }
-        }
-      }
-    },
-    'Python 3: GPU Win':{
-      node(NODE_WINDOWS_GPU) {
-        timeout(time: max_time, unit: 'MINUTES') {
-          ws('workspace/ut-python-gpu') {
-            try {
-              utils.init_git_win()
-              unstash 'windows_package_gpu'
-              powershell 'ci/windows/test_py3_gpu.ps1'
-            } finally {
-              utils.collect_test_results_windows('nosetests_forward.xml', 'nosetests_gpu_forward_windows_python3_gpu.xml')
-              utils.collect_test_results_windows('nosetests_operator.xml', 'nosetests_gpu_operator_windows_python3_gpu.xml')
-            }
-          }
-        }
-      }
-    },
-    'Python 3: MKLDNN-GPU Win':{
-      node(NODE_WINDOWS_GPU) {
-        timeout(time: max_time, unit: 'MINUTES') {
-          ws('workspace/ut-python-gpu') {
-            try {
-              utils.init_git_win()
-              unstash 'windows_package_gpu_mkldnn'
-              powershell 'ci/windows/test_py3_gpu.ps1'
-            } finally {
-              utils.collect_test_results_windows('nosetests_forward.xml', 'nosetests_gpu_forward_windows_python3_gpu_mkldnn.xml')
-              utils.collect_test_results_windows('nosetests_operator.xml', 'nosetests_gpu_operator_windows_python3_gpu_mkldnn.xml')
-            }
-          }
-        }
-      }
-    },
-    'Onnx CPU': {
-      node(NODE_LINUX_CPU) {
-        ws('workspace/it-onnx-cpu') {
-          timeout(time: max_time, unit: 'MINUTES') {
-            utils.unpack_and_init('cpu', mx_lib, true)
-            utils.docker_run('ubuntu_cpu', 'integrationtest_ubuntu_cpu_onnx', false)
-            utils.publish_test_coverage()
-          }
-        }
-      }
-    },
-    'Python GPU': {
-      node(NODE_LINUX_GPU) {
-        ws('workspace/it-python-gpu') {
-          timeout(time: max_time, unit: 'MINUTES') {
-            utils.unpack_and_init('gpu', mx_lib, true)
-            utils.docker_run('ubuntu_gpu', 'integrationtest_ubuntu_gpu_python', true)
-            utils.publish_test_coverage()
-          }
-        }
-      }
-    },
-    'cpp-package GPU': {
-      node(NODE_LINUX_GPU) {
-        ws('workspace/it-cpp-package') {
-          timeout(time: max_time, unit: 'MINUTES') {
-            utils.unpack_and_init('gpu', mx_lib_cpp_examples, true)
-            utils.docker_run('ubuntu_gpu', 'integrationtest_ubuntu_gpu_cpp_package', true)
-            utils.publish_test_coverage()
-          }
-        }
-      }
-    },
-    // Disabled due to: https://github.com/apache/incubator-mxnet/issues/11407
-    // 'Caffe GPU': {
-    //   node(NODE_LINUX_GPU) {
-    //     ws('workspace/it-caffe') {
-    //       timeout(time: max_time, unit: 'MINUTES') {
-    //         utils.init_git()
-    //         utils.unpack_lib('gpu', mx_lib)
-    //         utils.docker_run('ubuntu_gpu', 'integrationtest_ubuntu_gpu_caffe', true)
-    //         utils.publish_test_coverage()
-    //       }
-    //     }
-    //   }
-    // },
-    'dist-kvstore tests GPU': {
-      node(NODE_LINUX_GPU) {
-        ws('workspace/it-dist-kvstore') {
-          timeout(time: max_time, unit: 'MINUTES') {
-            utils.unpack_and_init('gpu', mx_lib, true)
-            utils.docker_run('ubuntu_gpu', 'integrationtest_ubuntu_gpu_dist_kvstore', true)
-            utils.publish_test_coverage()
-          }
-        }
-      }
-    },
-    /*  Disabled due to master build failure:
-     *  http://jenkins.mxnet-ci.amazon-ml.com/blue/organizations/jenkins/incubator-mxnet/detail/master/1221/pipeline/
-     *  https://github.com/apache/incubator-mxnet/issues/11801
-
-    'dist-kvstore tests CPU': {
-      node(NODE_LINUX_CPU) {
-        ws('workspace/it-dist-kvstore') {
-          timeout(time: max_time, unit: 'MINUTES') {
-            utils.unpack_and_init('cpu', mx_lib, true)
-            utils.docker_run('ubuntu_cpu', 'integrationtest_ubuntu_cpu_dist_kvstore', false)
-            utils.publish_test_coverage()
-          }
-        }
-      }
-    }, */
-    'Scala: GPU': {
-      node(NODE_LINUX_GPU) {
-        ws('workspace/ut-scala-gpu') {
-          timeout(time: max_time, unit: 'MINUTES') {
-            utils.unpack_and_init('gpu', mx_dist_lib, true)
-            utils.docker_run('ubuntu_gpu', 'integrationtest_ubuntu_gpu_scala', true)
-            utils.publish_test_coverage()
-          }
-        }
-      }
-    },
-    'ARMv7 QEMU': {
-      node(NODE_LINUX_CPU) {
-        ws('workspace/ut-armv7-qemu') {
-          timeout(time: max_time, unit: 'MINUTES') {
-            utils.unpack_and_init('armv7', mx_pip)
-            sh "ci/build.py --docker-registry ${env.DOCKER_CACHE_REGISTRY} -p test.arm_qemu ./runtime_functions.py run_ut_py3_qemu"
-          }
-        }
-      }
-    }
-  }
-
-  stage('Deploy') {
-    deploy_docs()
-  }
-}
-,
-failure_handler: {
-  // Only send email if master or release branches failed
-  if (currentBuild.result == "FAILURE" && (env.BRANCH_NAME == "master" || env.BRANCH_NAME.startsWith("v"))) {
-    emailext body: 'Build for MXNet branch ${BRANCH_NAME} has broken. Please view the build at ${BUILD_URL}', replyTo: '${EMAIL}', subject: '[BUILD FAILED] Branch ${BRANCH_NAME} build ${BUILD_NUMBER}', to: '${EMAIL}'
-  }
-}
-)

From d91284bdb1a2c3fac1bb4ba1bc0cac578ffc413a Mon Sep 17 00:00:00 2001
From: Steffen Rochel <steffenrochel@gmail.com>
Date: Sat, 1 Dec 2018 20:26:45 -0800
Subject: [PATCH 14/54] update github location for sampled_block.py (#13508)

Updated to https://github.com/dmlc/gluon-nlp/blob/master/src/gluonnlp/model/sampled_block.py
---
 example/recommenders/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/example/recommenders/README.md b/example/recommenders/README.md
index 4b1d5ca6da14..628182c849b8 100644
--- a/example/recommenders/README.md
+++ b/example/recommenders/README.md
@@ -17,7 +17,7 @@ The examples are driven by notebook files.
 ### Negative Sampling
 
 * A previous version of this example had an example of negative sampling. For example of negative sampling, please refer to:
-    [Gluon NLP Sampled Block](https://github.com/dmlc/gluon-nlp/blob/master/gluonnlp/model/sampled_block.py)
+    [Gluon NLP Sampled Block](https://github.com/dmlc/gluon-nlp/blob/master/src/gluonnlp/model/sampled_block.py)
     
 
 ## Acknowledgements

From b684c654f411dde232f36331cbc3c6eadf3efd54 Mon Sep 17 00:00:00 2001
From: Nicolas Modrzyk <hellonico@gmail.com>
Date: Sun, 2 Dec 2018 23:34:36 +0900
Subject: [PATCH 15/54] #13453 [Clojure] - Add Spec Validations to the
 Optimizer namespace (#13499)

---
 .../org/apache/clojure_mxnet/optimizer.clj    | 52 +++++++++++++++++--
 .../apache/clojure_mxnet/optimizer_test.clj   | 10 ++++
 2 files changed, 57 insertions(+), 5 deletions(-)

diff --git a/contrib/clojure-package/src/org/apache/clojure_mxnet/optimizer.clj b/contrib/clojure-package/src/org/apache/clojure_mxnet/optimizer.clj
index f18ff40f5698..f77f5532bfb1 100644
--- a/contrib/clojure-package/src/org/apache/clojure_mxnet/optimizer.clj
+++ b/contrib/clojure-package/src/org/apache/clojure_mxnet/optimizer.clj
@@ -17,7 +17,19 @@
 
 (ns org.apache.clojure-mxnet.optimizer
   (:refer-clojure :exclude [update])
-  (:import (org.apache.mxnet.optimizer SGD DCASGD NAG AdaDelta RMSProp AdaGrad Adam SGLD)))
+  (:require  
+   [clojure.spec.alpha :as s]
+   [org.apache.clojure-mxnet.util :as util])
+  (:import 
+   (org.apache.mxnet.optimizer SGD DCASGD NAG AdaDelta RMSProp AdaGrad Adam SGLD)
+   (org.apache.mxnet FactorScheduler)))
+
+(s/def ::learning-rate float?)
+(s/def ::momentum float?)
+(s/def ::wd float?)
+(s/def ::clip-gradient float?)
+(s/def ::lr-scheduler #(instance? FactorScheduler %)) ; % is the value under validation
+(s/def ::sgd-opts (s/keys :opt-un [::learning-rate ::momentum ::wd ::clip-gradient ::lr-scheduler]))
 
 (defn sgd
   "A very simple SGD optimizer with momentum and weight regularization."
@@ -26,10 +38,14 @@
           momentum 0.0
           wd 0.0001
           clip-gradient 0}}]
+   (util/validate! ::sgd-opts opts "Incorrect sgd optimizer options")
    (new SGD (float learning-rate) (float momentum) (float wd) (float clip-gradient) lr-scheduler))
   ([]
    (sgd {})))
 
+(s/def ::lambda float?)
+(s/def ::dcasgd-opts (s/keys :opt-un [::learning-rate ::momentum ::lambda ::wd ::clip-gradient ::lr-scheduler]))
+
 (defn dcasgd
   "DCASGD optimizer with momentum and weight regularization.
   Implementation of paper 'Asynchronous Stochastic Gradient Descent with
@@ -40,10 +56,13 @@
           lambda 0.04
           wd 0.0
           clip-gradient 0}}]
+   (util/validate! ::dcasgd-opts opts "Incorrect dcasgd optimizer options")
    (new DCASGD (float learning-rate) (float lambda) (float momentum) (float wd) (float clip-gradient) lr-scheduler))
   ([]
    (dcasgd {})))
 
+(s/def ::nag-opts (s/keys :opt-un [::learning-rate ::momentum ::wd ::clip-gradient ::lr-scheduler]))
+
 (defn nag
   "SGD with nesterov.
    It is implemented according to
@@ -53,10 +72,16 @@
           momentum 0.0
           wd 0.0001
           clip-gradient 0}}]
+   (util/validate! ::nag-opts opts "Incorrect nag optimizer options")
    (new NAG (float learning-rate) (float momentum) (float wd) (float clip-gradient) lr-scheduler))
   ([]
    (nag {})))
 
+(s/def ::rho float?)
+(s/def ::rescale-gradient float?)
+(s/def ::epsilon float?)
+(s/def ::ada-delta-opts (s/keys :opt-un [::rho ::rescale-gradient ::epsilon ::wd ::clip-gradient]))
+
 (defn ada-delta
   "AdaDelta optimizer as described in Matthew D. Zeiler, 2012.
    http://arxiv.org/abs/1212.5701"
@@ -66,10 +91,15 @@
           epsilon 1e-8
           wd 0.0
           clip-gradient 0}}]
+   (util/validate! ::ada-delta-opts opts "Incorrect ada-delta optimizer options")
    (new AdaDelta (float rho) (float rescale-gradient) (float epsilon) (float wd) (float clip-gradient)))
   ([]
    (ada-delta {})))
 
+(s/def ::gamma1 float?)
+(s/def ::gamma2 float?)
+(s/def ::rms-prop-opts (s/keys :opt-un [::learning-rate ::rescale-gradient ::gamma1 ::gamma2 ::wd ::clip-gradient]))
+
 (defn rms-prop
   "RMSProp optimizer as described in Tieleman & Hinton, 2012.
    http://arxiv.org/pdf/1308.0850v5.pdf Eq(38) - Eq(45) by Alex Graves, 2013.
@@ -80,18 +110,21 @@
    -  wd L2 regularization coefficient add to all the weights
    -  clip-gradient clip gradient in range [-clip_gradient, clip_gradient]
    -  lr-scheduler The learning rate scheduler"
-  ([{:keys [learning-rate rescale-gradient gamma1 gamma2 wd lr-scheduler clip-gradient]
+  ([{:keys [learning-rate rescale-gradient gamma1 gamma2 wd lr-scheduler clip-gradient] :as opts
      :or {learning-rate 0.002
           rescale-gradient 1.0
           gamma1 0.95
           gamma2 0.9
           wd 0.0
           clip-gradient 0}}]
+   (util/validate! ::rms-prop-opts opts "Incorrect rms-prop optimizer options")
    (new RMSProp (float learning-rate) (float rescale-gradient) (float gamma1)
         (float gamma2) (float wd) lr-scheduler (float clip-gradient)))
   ([]
    (rms-prop {})))
 
+(s/def ::ada-grad-opts (s/keys :opt-un [::learning-rate ::rescale-gradient ::epsilon ::wd]))
+
 (defn ada-grad
   " AdaGrad optimizer as described in Duchi, Hazan and Singer, 2011.
    http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf
@@ -101,15 +134,20 @@
                 Default value is set to 1e-7.
    - rescale-gradient rescaling factor of gradient.
    - wd L2 regularization coefficient add to all the weights"
-  ([{:keys [learning-rate rescale-gradient epsilon wd]
+  ([{:keys [learning-rate rescale-gradient epsilon wd] :as opts
      :or {learning-rate 0.05
           rescale-gradient 1.0
           epsilon 1e-7
           wd 0.0}}]
+   (util/validate! ::ada-grad-opts opts "Incorrect ada-grad optimizer options")
    (new AdaGrad (float learning-rate) (float rescale-gradient) (float epsilon) (float wd)))
   ([]
    (ada-grad {})))
 
+(s/def ::beta1 float?)
+(s/def ::beta2 float?)
+(s/def ::adam-opts (s/keys :opt-un [::learning-rate ::beta1 ::beta2 ::epsilon ::decay-factor ::wd ::clip-gradient ::lr-scheduler]))
+
 (defn adam
   "Adam optimizer as described in [King2014]
 
@@ -125,7 +163,7 @@
    - wd L2 regularization coefficient add to all the weights
    - clip-gradient  clip gradient in range [-clip_gradient, clip_gradient]
    - lr-scheduler The learning rate scheduler"
-  ([{:keys [learning-rate beta1 beta2 epsilon decay-factor wd clip-gradient lr-scheduler]
+  ([{:keys [learning-rate beta1 beta2 epsilon decay-factor wd clip-gradient lr-scheduler] :as opts
      :or {learning-rate 0.002
           beta1 0.9
           beta2 0.999
@@ -133,11 +171,14 @@
           decay-factor (- 1 1e-8)
           wd 0
           clip-gradient 0}}]
+   (util/validate! ::adam-opts opts "Incorrect adam optimizer options")
    (new Adam (float learning-rate) (float beta1) (float beta2) (float epsilon)
         (float decay-factor) (float wd) (float clip-gradient) lr-scheduler))
   ([]
    (adam {})))
 
+(s/def ::sgld-opts (s/keys :opt-un [::learning-rate ::rescale-gradient ::wd ::clip-gradient ::lr-scheduler]))
+
 (defn sgld
   "Stochastic Langevin Dynamics Updater to sample from a distribution.
 
@@ -146,11 +187,12 @@
   - wd L2 regularization coefficient add to all the weights
   - clip-gradient Float, clip gradient in range [-clip_gradient, clip_gradient]
   - lr-scheduler The learning rate scheduler"
-  ([{:keys [learning-rate rescale-gradient wd clip-gradient lr-scheduler]
+  ([{:keys [learning-rate rescale-gradient wd clip-gradient lr-scheduler] :as opts
      :or {learning-rate 0.01
           rescale-gradient 1
           wd 0.0001
           clip-gradient 0}}]
+   (util/validate! ::sgld-opts opts "Incorrect sgld optimizer options")
    (new SGLD (float learning-rate) (float rescale-gradient) (float wd)
         (float clip-gradient) lr-scheduler))
   ([]
diff --git a/contrib/clojure-package/test/org/apache/clojure_mxnet/optimizer_test.clj b/contrib/clojure-package/test/org/apache/clojure_mxnet/optimizer_test.clj
index f6461b10f028..599a0672bea5 100644
--- a/contrib/clojure-package/test/org/apache/clojure_mxnet/optimizer_test.clj
+++ b/contrib/clojure-package/test/org/apache/clojure_mxnet/optimizer_test.clj
@@ -44,3 +44,13 @@
               ["sgld" optimizer/sgld]]]
     (doseq [opt opts]
       (test-optimizer opt))))
+
+(deftest test-optimizers-parameters-specs
+  (is (thrown? Exception (optimizer/sgd {:wd 'a})))
+  (is (thrown? Exception (optimizer/dcasgd {:lambda 'a})))
+  (is (thrown? Exception (optimizer/nag {:momentum 'a})))
+  (is (thrown? Exception (optimizer/ada-delta {:epsilon 'a})))
+  (is (thrown? Exception (optimizer/rms-prop {:gamma1 'a})))
+  (is (thrown? Exception (optimizer/ada-grad {:rescale-gradient 'a})))
+  (is (thrown? Exception (optimizer/adam {:beta1 'a})))
+  (is (thrown? Exception (optimizer/sgld {:lr-scheduler 0.1}))))
\ No newline at end of file

From 7d44deb4b209cc313d74d76dc8c3c0efc83e1126 Mon Sep 17 00:00:00 2001
From: Vandana Kannan <vandanavk@users.noreply.github.com>
Date: Sun, 2 Dec 2018 20:30:35 -0800
Subject: [PATCH 16/54] ONNX export: Logical operators (#12852)

---
 .../contrib/onnx/mx2onnx/_op_translations.py  | 32 +++++++++++++++
 .../onnx/export/mxnet_export_test.py          | 39 +++++++++++++++++++
 tests/python-pytest/onnx/import/test_cases.py |  1 -
 3 files changed, 71 insertions(+), 1 deletion(-)

diff --git a/python/mxnet/contrib/onnx/mx2onnx/_op_translations.py b/python/mxnet/contrib/onnx/mx2onnx/_op_translations.py
index facdcfedcbca..86767a667128 100644
--- a/python/mxnet/contrib/onnx/mx2onnx/_op_translations.py
+++ b/python/mxnet/contrib/onnx/mx2onnx/_op_translations.py
@@ -1613,3 +1613,35 @@ def convert_broadcast_equal(node, **kwargs):
     and return the created node.
     """
     return create_basic_op_node('Equal', node, kwargs)
+
+
+@mx_op.register("broadcast_logical_and")
+def convert_broadcast_logical_and(node, **kwargs):
+    """Map MXNet's broadcast logical and operator attributes to onnx's And operator
+    and return the created node.
+    """
+    return create_basic_op_node('And', node, kwargs)
+
+
+@mx_op.register("broadcast_logical_or")
+def convert_broadcast_logical_or(node, **kwargs):
+    """Map MXNet's broadcast logical or operator attributes to onnx's Or operator
+    and return the created node.
+    """
+    return create_basic_op_node('Or', node, kwargs)
+
+
+@mx_op.register("broadcast_logical_xor")
+def convert_broadcast_logical_xor(node, **kwargs):
+    """Map MXNet's broadcast logical xor operator attributes to onnx's Xor operator
+    and return the created node.
+    """
+    return create_basic_op_node('Xor', node, kwargs)
+
+
+@mx_op.register("logical_not")
+def convert_logical_not(node, **kwargs):
+    """Map MXNet's logical not operator attributes to onnx's Not operator
+    and return the created node.
+    """
+    return create_basic_op_node('Not', node, kwargs)
diff --git a/tests/python-pytest/onnx/export/mxnet_export_test.py b/tests/python-pytest/onnx/export/mxnet_export_test.py
index 964d0e760cae..6b858f05e24f 100644
--- a/tests/python-pytest/onnx/export/mxnet_export_test.py
+++ b/tests/python-pytest/onnx/export/mxnet_export_test.py
@@ -268,6 +268,45 @@ def test_ops(op_name, inputs, input_tensors, numpy_op):
     test_ops("Equal", input_data, input_tensor,
              np.equal(input_data[0], input_data[1]).astype(np.float32))
 
+
+def get_int_inputs(interval, shape):
+    """Helper to get integer input of given shape and range"""
+    assert len(interval) == len(shape)
+    inputs = []
+    input_tensors = []
+    for idx in range(len(interval)):
+        low, high = interval[idx]
+        inputs.append(np.random.randint(low, high, size=shape[idx]).astype("float32"))
+        input_tensors.append(helper.make_tensor_value_info("input"+str(idx+1),
+                                                        TensorProto.FLOAT, shape=shape[idx]))
+    return inputs, input_tensors
+
+
+@with_seed()
+def test_logical_ops():
+    """Test for logical and, or, not, xor operators"""
+    def test_ops(op_name, inputs, input_tensors, numpy_op):
+        outputs = [helper.make_tensor_value_info("output", TensorProto.FLOAT, shape=np.shape(inputs[0]))]
+        nodes = [helper.make_node(op_name, ["input"+str(i+1) for i in range(len(inputs))], ["output"])]
+        graph = helper.make_graph(nodes,
+                                  op_name + "_test",
+                                  input_tensors,
+                                  outputs)
+        model = helper.make_model(graph)
+        bkd_rep = backend.prepare(model)
+        output = bkd_rep.run(inputs)
+        npt.assert_almost_equal(output[0], numpy_op)
+    input_data, input_tensor = get_int_inputs([(0, 2), (0, 2)], [(3, 4, 5), (3, 4, 5)])
+    test_ops("And", input_data, input_tensor,
+             np.logical_and(input_data[0], input_data[1]).astype(np.float32))
+    test_ops("Or", input_data, input_tensor,
+             np.logical_or(input_data[0], input_data[1]).astype(np.float32))
+    test_ops("Xor", input_data, input_tensor,
+             np.logical_xor(input_data[0], input_data[1]).astype(np.float32))
+    test_ops("Not", [input_data[0]], [input_tensor[0]],
+             np.logical_not(input_data[0]).astype(np.float32))
+
+
 def _assert_sym_equal(lhs, rhs):
     assert lhs.list_inputs() == rhs.list_inputs()  # input names must be identical
     assert len(lhs.list_outputs()) == len(rhs.list_outputs())  # number of outputs must be identical
diff --git a/tests/python-pytest/onnx/import/test_cases.py b/tests/python-pytest/onnx/import/test_cases.py
index aed68ffa114c..f41fe92352db 100644
--- a/tests/python-pytest/onnx/import/test_cases.py
+++ b/tests/python-pytest/onnx/import/test_cases.py
@@ -55,7 +55,6 @@
     'test_argmax',
     'test_argmin',
     'test_min',
-    'test_logical_',
     # enabling partial test cases for matmul
     'test_matmul_3d',
     'test_matmul_4d',

From 9979c3cea0aac1a856f511fa195a2be71161ec23 Mon Sep 17 00:00:00 2001
From: Pedro Larroy <928489+larroy@users.noreply.github.com>
Date: Mon, 3 Dec 2018 15:17:09 +0100
Subject: [PATCH 17/54] Fix cmake options parsing in dev_menu (#13458)

Add GPU+MKLDNN unittests to dev_menu
---
 cmake/cmake_options.yml | 63 +++++++++++++++++++++--------------------
 dev_menu.py             | 20 +++++++------
 2 files changed, 44 insertions(+), 39 deletions(-)

diff --git a/cmake/cmake_options.yml b/cmake/cmake_options.yml
index 6fbf4e1d0617..01446f7b8f28 100644
--- a/cmake/cmake_options.yml
+++ b/cmake/cmake_options.yml
@@ -16,34 +16,35 @@
 # under the License.
 
 --- # CMake configuration
-USE_CUDA: OFF # Build with CUDA support
-USE_OLDCMAKECUDA: OFF # Build with old cmake cuda
-USE_NCCL: OFF # Use NVidia NCCL with CUDA
-USE_OPENCV: ON # Build with OpenCV support
-USE_OPENMP: ON # Build with Openmp support
-USE_CUDNN: ON # Build with cudnn support) # one could set CUDNN_ROOT for search path
-USE_SSE: ON # Build with x86 SSE instruction support IF NOT ARM
-USE_F16C: ON # Build with x86 F16C instruction support) # autodetects support if ON
-USE_LAPACK: ON # Build with lapack support
-USE_MKL_IF_AVAILABLE: ON # Use MKL if found
-USE_MKLML_MKL: ON # Use MKLDNN variant of MKL (if MKL found) IF USE_MKL_IF_AVAILABLE AND (NOT APPLE)
-USE_MKLDNN: ON # Use MKLDNN variant of MKL (if MKL found) IF USE_MKL_IF_AVAILABLE AND (NOT APPLE)
-USE_OPERATOR_TUNING: ON # Enable auto-tuning of operators IF NOT MSVC
-USE_GPERFTOOLS: ON # Build with GPerfTools support (if found)
-USE_JEMALLOC: ON # Build with Jemalloc support
-USE_PROFILER: ON # Build with Profiler support
-USE_DIST_KVSTORE: OFF # Build with DIST_KVSTORE support
-USE_PLUGINS_WARPCTC: OFF # Use WARPCTC Plugins
-USE_PLUGIN_CAFFE: OFF # Use Caffe Plugin
-USE_CPP_PACKAGE: OFF # Build C++ Package
-USE_MXNET_LIB_NAMING: ON # Use MXNet library naming conventions.
-USE_GPROF: OFF # Compile with gprof (profiling) flag
-USE_CXX14_IF_AVAILABLE: OFF # Build with C++14 if the compiler supports it
-USE_VTUNE: OFF # Enable use of Intel Amplifier XE (VTune)) # one could set VTUNE_ROOT for search path
-ENABLE_CUDA_RTC: ON # Build with CUDA runtime compilation support
-BUILD_CPP_EXAMPLES: ON # Build cpp examples
-INSTALL_EXAMPLES: OFF # Install the example source files.
-USE_SIGNAL_HANDLER: OFF # Print stack traces on segfaults.
-USE_TENSORRT: OFF # Enable infeference optimization with TensorRT.
-USE_ASAN: OFF # Enable Clang/GCC ASAN sanitizers.
-ENABLE_TESTCOVERAGE: OFF # Enable compilation with test coverage metric output
+USE_CUDA: "ON" # Build with CUDA support
+USE_OLDCMAKECUDA: "OFF" # Build with old cmake cuda
+USE_NCCL: "OFF" # Use NVidia NCCL with CUDA
+USE_OPENCV: "ON" # Build with OpenCV support
+USE_OPENMP: "ON" # Build with Openmp support
+USE_CUDNN: "ON" # Build with cudnn support; one could set CUDNN_ROOT for search path
+USE_SSE: "ON" # Build with x86 SSE instruction support IF NOT ARM
+USE_F16C: "ON" # Build with x86 F16C instruction support; autodetects support if "ON"
+USE_LAPACK: "ON" # Build with lapack support
+USE_MKL_IF_AVAILABLE: "ON" # Use MKL if found
+USE_MKLML_MKL: "ON" # Use MKLDNN variant of MKL (if MKL found) IF USE_MKL_IF_AVAILABLE AND (NOT APPLE)
+USE_MKLDNN: "ON" # Use MKLDNN variant of MKL (if MKL found) IF USE_MKL_IF_AVAILABLE AND (NOT APPLE)
+USE_OPERATOR_TUNING: "ON" # Enable auto-tuning of operators IF NOT MSVC
+USE_GPERFTOOLS: "ON" # Build with GPerfTools support (if found)
+USE_JEMALLOC: "ON" # Build with Jemalloc support
+USE_PROFILER: "ON" # Build with Profiler support
+USE_DIST_KVSTORE: "OFF" # Build with DIST_KVSTORE support
+USE_PLUGINS_WARPCTC: "OFF" # Use WARPCTC Plugins
+USE_PLUGIN_CAFFE: "OFF" # Use Caffe Plugin
+USE_CPP_PACKAGE: "OFF" # Build C++ Package
+USE_MXNET_LIB_NAMING: "ON" # Use MXNet library naming conventions.
+USE_GPROF: "OFF" # Compile with gprof (profiling) flag
+USE_CXX14_IF_AVAILABLE: "OFF" # Build with C++14 if the compiler supports it
+USE_VTUNE: "OFF" # Enable use of Intel Amplifier XE (VTune); one could set VTUNE_ROOT for search path
+ENABLE_CUDA_RTC: "ON" # Build with CUDA runtime compilation support
+BUILD_CPP_EXAMPLES: "ON" # Build cpp examples
+INSTALL_EXAMPLES: "OFF" # Install the example source files.
+USE_SIGNAL_HANDLER: "ON" # Print stack traces on segfaults.
+USE_TENSORRT: "OFF" # Enable inference optimization with TensorRT.
+USE_ASAN: "OFF" # Enable Clang/GCC ASAN sanitizers.
+ENABLE_TESTCOVERAGE: "OFF" # Enable compilation with test coverage metric output
+CMAKE_BUILD_TYPE: "Debug"
diff --git a/dev_menu.py b/dev_menu.py
index 27db9e8aca6f..0fd78cb222e3 100755
--- a/dev_menu.py
+++ b/dev_menu.py
@@ -46,8 +46,12 @@ def __call__(self):
                 resp = input("Please answer yes or no: ")
 
 class CMake(object):
-    def __init__(self, cmake_options_yaml='cmake/cmake_options.yml'):
-        self.cmake_options_yaml = cmake_options_yaml
+    def __init__(self, cmake_options_yaml='cmake_options.yml', cmake_options_yaml_default='cmake/cmake_options.yml'):
+        if os.path.exists(cmake_options_yaml):
+            self.cmake_options_yaml = cmake_options_yaml
+        else:
+            self.cmake_options_yaml = cmake_options_yaml_default
+        logging.info('Using {} for CMake configuration'.format(self.cmake_options_yaml))
         self.cmake_options = None
         self.read_config()
 
@@ -58,13 +62,8 @@ def read_config(self):
 
     def _cmdlineflags(self):
         res = []
-        def _bool_ON_OFF(x):
-            if x:
-                return 'ON'
-            else:
-                return 'OFF'
         for opt,v in self.cmake_options.items():
-            res.append('-D{}={}'.format(opt,_bool_ON_OFF(v)))
+            res.append('-D{}={}'.format(opt,v))
         return res
 
     def cmake_command(self) -> str:
@@ -103,6 +102,11 @@ def __call__(self, build_dir='build', generator='Ninja', build_cmd='ninja'):
         "ci/build.py --platform ubuntu_gpu /work/runtime_functions.sh build_ubuntu_gpu",
         "ci/build.py --nvidiadocker --platform ubuntu_gpu /work/runtime_functions.sh unittest_ubuntu_python3_gpu",
     ]),
+    ('[Docker] Python3 GPU+MKLDNN unittests',
+    [
+        "ci/build.py --platform ubuntu_gpu /work/runtime_functions.sh build_ubuntu_gpu_cmake_mkldnn",
+        "ci/build.py --nvidiadocker --platform ubuntu_gpu /work/runtime_functions.sh unittest_ubuntu_python3_gpu",
+    ]),
     ('[Docker] Python3 CPU Intel MKLDNN unittests',
     [
         "ci/build.py --platform ubuntu_cpu /work/runtime_functions.sh build_ubuntu_cpu_mkldnn",

From 89499896108ac37f87afbe03ddad658b466deba7 Mon Sep 17 00:00:00 2001
From: Anirudh Subramanian <anirudh2290@apache.org>
Date: Mon, 3 Dec 2018 13:50:12 -0800
Subject: [PATCH 18/54] Revert "Manually track num_max_thread (#12380)"
 (#13501)

This reverts commit 75410210e07a5fab5e044348aee276d578d5857e.
---
 src/engine/openmp.cc | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/src/engine/openmp.cc b/src/engine/openmp.cc
index 64899b09660e..8fe3939892d2 100644
--- a/src/engine/openmp.cc
+++ b/src/engine/openmp.cc
@@ -73,14 +73,18 @@ void OpenMP::set_reserve_cores(int cores) {
   CHECK_GE(cores, 0);
   reserve_cores_ = cores;
 #ifdef _OPENMP
-  omp_thread_max_ = std::max(omp_thread_max_ - reserve_cores_, 1);
+  if (reserve_cores_ >= omp_thread_max_) {
+    omp_set_num_threads(1);
+  } else {
+    omp_set_num_threads(omp_thread_max_ - reserve_cores_);
+  }
 #endif
 }
 
 int OpenMP::GetRecommendedOMPThreadCount(bool exclude_reserved) const {
 #ifdef _OPENMP
   if (omp_num_threads_set_in_environment_) {
-    return omp_thread_max_;
+    return omp_get_max_threads();
   }
   if (enabled_) {
     int thread_count = omp_get_max_threads();
@@ -97,8 +101,10 @@ int OpenMP::GetRecommendedOMPThreadCount(bool exclude_reserved) const {
     }
     return omp_thread_max_;
   }
-#endif
   return 1;
+#else
+  return 1;
+#endif
 }
 
 OpenMP *__init_omp__ = OpenMP::Get();

From 65edc9500b10a3404945d6d79acbae54a2833890 Mon Sep 17 00:00:00 2001
From: Alexander Zai <azai91@gmail.com>
Date: Mon, 3 Dec 2018 15:19:40 -0800
Subject: [PATCH 19/54] Feature/mkldnn static 2 (#13503)

* build mkldnn as static lib

* update makefile to statically build mkldnn

* build static mkldnn

* fix static name

* fix static name

* update static for mac

* rename mkldnn dep in ci

* remove moving mkldnn dynamic lib

* remove commented code

* remove mkldnn dynamic for unittest

* force static for mkldnn lib

* remove dynamic mkldnn bind

* only link windows

* add mkldnn.mk

* try force linking

* remove mkldnn dynamic check

* remove test mkldnn install

* fix spacing

* fix index

* add artifacts

* add comment about windows

* remove static

* update makefile
---
 CMakeLists.txt                          |  1 +
 Makefile                                |  9 +++-
 ci/docker/runtime_functions.sh          |  3 --
 ci/jenkins/Jenkins_steps.groovy         |  8 ++--
 mkldnn.mk                               | 12 ++++--
 tests/cpp/unittest.mk                   |  8 ++--
 tests/python/mkl/test_mkldnn.py         |  6 +--
 tests/python/mkl/test_mkldnn_install.py | 56 -------------------------
 8 files changed, 26 insertions(+), 77 deletions(-)
 delete mode 100644 tests/python/mkl/test_mkldnn_install.py

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3b8bbd2e0272..161705643194 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -227,6 +227,7 @@ if(USE_MKLDNN)
   include(cmake/DownloadMKLML.cmake)
   # CPU architecture (e.g., C5) can't run on another architecture (e.g., g3).
   if(NOT MSVC)
+    set(MKLDNN_LIBRARY_TYPE "STATIC" CACHE INTERNAL "" FORCE)
     set(ARCH_OPT_FLAGS "-mtune=generic")
   else()
     set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /EHsc")
diff --git a/Makefile b/Makefile
index 16ea59f3d585..e424904ad785 100644
--- a/Makefile
+++ b/Makefile
@@ -131,8 +131,13 @@ ifeq ($(USE_MKLDNN), 1)
 		CFLAGS += -I$(MKLROOT)/include
 		LDFLAGS += -L$(MKLROOT)/lib
 	endif
-	CFLAGS += -I$(MKLDNNROOT)/include
-	LDFLAGS += -L$(MKLDNNROOT)/lib -lmkldnn -Wl,-rpath,'$${ORIGIN}'
+	# MKLDNN needs to be dynamically linked on Windows, as not all VS compilers support static linking
+	ifneq ($(UNAME_S), Windows)
+		LIB_DEP += $(MKLDNNROOT)/lib/libmkldnn.a
+	else
+		CFLAGS += -I$(MKLDNNROOT)/include
+		LDFLAGS += -L$(MKLDNNROOT)/lib -lmkldnn -Wl,-rpath,'$${ORIGIN}'
+	endif
 endif
 
 # setup opencv
diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh
index 1fc10bf0e085..5a44cccc6aa0 100755
--- a/ci/docker/runtime_functions.sh
+++ b/ci/docker/runtime_functions.sh
@@ -629,9 +629,6 @@ build_ubuntu_gpu_cmake_mkldnn() {
         /work/mxnet
 
     ninja -v
-    # libmkldnn.so.0 is a link file. We need an actual binary file named libmkldnn.so.0.
-    cp 3rdparty/mkldnn/src/libmkldnn.so.0 3rdparty/mkldnn/src/libmkldnn.so.0.tmp
-    mv 3rdparty/mkldnn/src/libmkldnn.so.0.tmp 3rdparty/mkldnn/src/libmkldnn.so.0
 }
 
 build_ubuntu_gpu_cmake() {
diff --git a/ci/jenkins/Jenkins_steps.groovy b/ci/jenkins/Jenkins_steps.groovy
index f48a26737308..309775c88c85 100644
--- a/ci/jenkins/Jenkins_steps.groovy
+++ b/ci/jenkins/Jenkins_steps.groovy
@@ -23,19 +23,19 @@
 utils = load('ci/Jenkinsfile_utils.groovy')
 
 // mxnet libraries
-mx_lib = 'lib/libmxnet.so, lib/libmxnet.a, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a'
+mx_lib = 'lib/libmxnet.so, lib/libmxnet.a, lib/libiomp5.so, lib/libmklml_intel.so, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a'
 
 // Python wheels
 mx_pip = 'build/*.whl'
 
 // for scala build, need to pass extra libs when run with dist_kvstore
-mx_dist_lib = 'lib/libmxnet.so, lib/libmxnet.a, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a, 3rdparty/ps-lite/build/libps.a, deps/lib/libprotobuf-lite.a, deps/lib/libzmq.a'
+mx_dist_lib = 'lib/libmxnet.so, lib/libmxnet.a, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a, 3rdparty/ps-lite/build/libps.a, deps/lib/libprotobuf-lite.a, deps/lib/libzmq.a, lib/libmkldnn.a'
 // mxnet cmake libraries, in cmake builds we do not produce a libnvvm static library by default.
 mx_cmake_lib = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/dmlc-core/libdmlc.a, build/tests/mxnet_unit_tests, build/3rdparty/openmp/runtime/src/libomp.so'
 // mxnet cmake libraries, in cmake builds we do not produce a libnvvm static library by default.
 mx_cmake_lib_debug = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/dmlc-core/libdmlc.a, build/tests/mxnet_unit_tests'
-mx_cmake_mkldnn_lib = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/dmlc-core/libdmlc.a, build/tests/mxnet_unit_tests, build/3rdparty/openmp/runtime/src/libomp.so, build/3rdparty/mkldnn/src/libmkldnn.so.0'
-mx_mkldnn_lib = 'lib/libmxnet.so, lib/libmxnet.a, lib/libiomp5.so, lib/libmkldnn.so.0, lib/libmklml_intel.so, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a'
+mx_cmake_mkldnn_lib = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/dmlc-core/libdmlc.a, build/tests/mxnet_unit_tests, build/3rdparty/openmp/runtime/src/libomp.so'
+mx_mkldnn_lib = 'lib/libmxnet.so, lib/libmxnet.a, lib/libiomp5.so, lib/libmklml_intel.so, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a'
 mx_tensorrt_lib = 'lib/libmxnet.so, lib/libnvonnxparser_runtime.so.0, lib/libnvonnxparser.so.0, lib/libonnx_proto.so, lib/libonnx.so'
 mx_lib_cpp_examples = 'lib/libmxnet.so, lib/libmxnet.a, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a, 3rdparty/ps-lite/build/libps.a, deps/lib/libprotobuf-lite.a, deps/lib/libzmq.a, build/cpp-package/example/*'
 mx_lib_cpp_examples_cpu = 'build/libmxnet.so, build/cpp-package/example/*'
diff --git a/mkldnn.mk b/mkldnn.mk
index d79bbe7d2a0e..5af3e9b1d741 100644
--- a/mkldnn.mk
+++ b/mkldnn.mk
@@ -19,14 +19,20 @@ ifeq ($(USE_MKLDNN), 1)
 	MKLDNN_SUBMODDIR = $(ROOTDIR)/3rdparty/mkldnn
 	MKLDNN_BUILDDIR = $(MKLDNN_SUBMODDIR)/build
 	MXNET_LIBDIR = $(ROOTDIR)/lib
+	MKLDNN_LIBRARY_TYPE=STATIC
 ifeq ($(UNAME_S), Darwin)
 	OMP_LIBFILE = $(MKLDNNROOT)/lib/libiomp5.dylib
 	MKLML_LIBFILE = $(MKLDNNROOT)/lib/libmklml.dylib
-	MKLDNN_LIBFILE = $(MKLDNNROOT)/lib/libmkldnn.0.dylib
+	MKLDNN_LIBFILE = $(MKLDNNROOT)/lib/libmkldnn.a
+else ifeq ($(UNAME_S), Windows)
+	OMP_LIBFILE = $(MKLDNNROOT)/lib/libiomp5.so
+	MKLML_LIBFILE = $(MKLDNNROOT)/lib/libmklml_intel.so
+	MKLDNN_LIBFILE = $(MKLDNNROOT)/lib/libmkldnn.so
+	MKLDNN_LIBRARY_TYPE=SHARED
 else
 	OMP_LIBFILE = $(MKLDNNROOT)/lib/libiomp5.so
 	MKLML_LIBFILE = $(MKLDNNROOT)/lib/libmklml_intel.so
-	MKLDNN_LIBFILE = $(MKLDNNROOT)/lib/libmkldnn.so.0
+	MKLDNN_LIBFILE = $(MKLDNNROOT)/lib/libmkldnn.a
 endif
 endif
 
@@ -37,7 +43,7 @@ mkldnn_build: $(MKLDNN_LIBFILE)
 $(MKLDNN_LIBFILE):
 	mkdir -p $(MKLDNNROOT)
 	cd $(MKLDNN_SUBMODDIR) && rm -rf external && cd scripts && ./prepare_mkl.sh && cd .. && cp -a external/*/* $(MKLDNNROOT)/.
-	cmake $(MKLDNN_SUBMODDIR) -DCMAKE_INSTALL_PREFIX=$(MKLDNNROOT) -B$(MKLDNN_BUILDDIR) -DARCH_OPT_FLAGS="-mtune=generic" -DWITH_TEST=OFF -DWITH_EXAMPLE=OFF
+	cmake $(MKLDNN_SUBMODDIR) -DCMAKE_INSTALL_PREFIX=$(MKLDNNROOT) -B$(MKLDNN_BUILDDIR) -DARCH_OPT_FLAGS="-mtune=generic" -DWITH_TEST=OFF -DWITH_EXAMPLE=OFF -DMKLDNN_LIBRARY_TYPE=$(MKLDNN_LIBRARY_TYPE)
 	$(MAKE) -C $(MKLDNN_BUILDDIR) VERBOSE=1
 	$(MAKE) -C $(MKLDNN_BUILDDIR) install
 	mkdir -p $(MXNET_LIBDIR)
diff --git a/tests/cpp/unittest.mk b/tests/cpp/unittest.mk
index 746ee2f096f1..665ce6982874 100644
--- a/tests/cpp/unittest.mk
+++ b/tests/cpp/unittest.mk
@@ -41,22 +41,22 @@ gtest-all.o : $(GTEST_SRCS_)
 gtest.a : gtest-all.o
 	$(AR) $(ARFLAGS) $@ $^
 
-build/tests/cpp/%.o : tests/cpp/%.cc | mkldnn
+build/tests/cpp/%.o : tests/cpp/%.cc
 	@mkdir -p $(@D)
 	$(CXX) -std=c++11 $(TEST_CFLAGS) -I$(GTEST_INC) -MM -MT tests/cpp/$* $< > build/tests/cpp/$*.d
 	$(CXX) -c -std=c++11 $(TEST_CFLAGS) -I$(GTEST_INC) -o build/tests/cpp/$*.o $(filter %.cc %.a, $^)
 
-build/tests/cpp/operator/%.o : tests/cpp/operator/%.cc | mkldnn
+build/tests/cpp/operator/%.o : tests/cpp/operator/%.cc
 	@mkdir -p $(@D)
 	$(CXX) -std=c++11 $(TEST_CFLAGS) -I$(GTEST_INC) -MM -MT tests/cpp/operator/$* $< > build/tests/cpp/operator/$*.d
 	$(CXX) -c -std=c++11 $(TEST_CFLAGS) -I$(GTEST_INC) -o build/tests/cpp/operator/$*.o $(filter %.cc %.a, $^)
 
-build/tests/cpp/storage/%.o : tests/cpp/storage/%.cc | mkldnn
+build/tests/cpp/storage/%.o : tests/cpp/storage/%.cc
 	@mkdir -p $(@D)
 	$(CXX) -std=c++11 $(TEST_CFLAGS) -I$(GTEST_INC) -MM -MT tests/cpp/storage/$* $< > build/tests/cpp/storage/$*.d
 	$(CXX) -c -std=c++11 $(TEST_CFLAGS) -I$(GTEST_INC) -o build/tests/cpp/storage/$*.o $(filter %.cc %.a, $^)
 
-build/tests/cpp/engine/%.o : tests/cpp/engine/%.cc | mkldnn
+build/tests/cpp/engine/%.o : tests/cpp/engine/%.cc
 	@mkdir -p $(@D)
 	$(CXX) -std=c++11 $(TEST_CFLAGS) -I$(GTEST_INC) -MM -MT tests/cpp/engine/$* $< > build/tests/cpp/engine/$*.d
 	$(CXX) -c -std=c++11 $(TEST_CFLAGS) -I$(GTEST_INC) -o build/tests/cpp/engine/$*.o $(filter %.cc %.a, $^)
diff --git a/tests/python/mkl/test_mkldnn.py b/tests/python/mkl/test_mkldnn.py
index c6c0a0832f1f..d9d3abfc3ced 100644
--- a/tests/python/mkl/test_mkldnn.py
+++ b/tests/python/mkl/test_mkldnn.py
@@ -27,7 +27,6 @@
 from mxnet import gluon
 from mxnet.gluon import nn
 from mxnet.test_utils import *
-import test_mkldnn_install as install
 curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
 sys.path.append(os.path.join(curr_path, '../unittest/'))
 from common import with_seed
@@ -441,7 +440,4 @@ def backward(self, req, out_grad, in_data, out_data, in_grad, aux):
     custom = mx.symbol.Custom(name='custom', data=conv, op_type='custom')
     exec1 = custom.bind(mx.cpu(), args={'data': mx.nd.ones([10,3,96,96]), 'conv_weight': mx.nd.ones([8,3,5,5])})
     exec1.forward()[0].wait_to_read()
-
-
-if __name__ == '__main__':
-    install.test_mkldnn_install()
+    
diff --git a/tests/python/mkl/test_mkldnn_install.py b/tests/python/mkl/test_mkldnn_install.py
deleted file mode 100644
index c2f26df72f2e..000000000000
--- a/tests/python/mkl/test_mkldnn_install.py
+++ /dev/null
@@ -1,56 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-"""
-MKL-DNN related test cases
-"""
-
-import sys
-import os
-import logging
-
-
-def test_mkldnn_install():
-    """
-    This test will verify that MXNet is built/installed correctly when
-    compiled with Intel MKL-DNN library. The method will try to import
-    the mxnet module and see if the mkldnn library is mapped to this
-    process's address space.
-    """
-    logging.basicConfig(level=logging.INFO)
-
-    if not sys.platform.startswith('linux'):
-        logging.info("Bypass mkldnn install test for non-Linux OS")
-        return
-
-    try:
-        #pylint: disable=unused-variable
-        import mxnet as mx
-    except (ImportError, OSError) as e:
-        assert 0, "Import mxnet error: %s. Please double check your build/" \
-            "install steps or environment variable settings" % str(e)
-
-    pid = os.getpid()
-    rc = os.system("cat /proc/" + str(pid) +
-                   "/maps | grep libmkldnn > /dev/null")
-
-    if rc == 0:
-        logging.info("MXNet is built/installed correctly with MKL-DNN")
-    else:
-        assert 0, "MXNet is built/installed incorrectly with MKL-DNN, please " \
-            "double check your build/install steps or environment " \
-            "variable settings"

From f2dcd7c7b8676b55d912997fc3f9c62c55915307 Mon Sep 17 00:00:00 2001
From: Aaron Markham <markhama@amazon.com>
Date: Mon, 3 Dec 2018 17:27:41 -0800
Subject: [PATCH 20/54] fix toctree Sphinx errors (#13489)

* fix toctree errors

* nudging file for CI
---
 docs/api/index.md       | 2 ++
 docs/tutorials/index.md | 3 ++-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/docs/api/index.md b/docs/api/index.md
index eff6807678ea..9e7a58f7778c 100644
--- a/docs/api/index.md
+++ b/docs/api/index.md
@@ -1,11 +1,13 @@
 # MXNet APIs
 
+
 ```eval_rst
 .. toctree::
    :maxdepth: 1
 
    c++/index.md
    clojure/index.md
+   java/index.md
    julia/index.md
    perl/index.md
    python/index.md
diff --git a/docs/tutorials/index.md b/docs/tutorials/index.md
index 52e2be8f6a2b..7d102bb88f89 100644
--- a/docs/tutorials/index.md
+++ b/docs/tutorials/index.md
@@ -3,12 +3,13 @@
 ```eval_rst
 .. toctree::
    :hidden:
-   
+
    basic/index.md
    c++/index.md
    control_flow/index.md
    embedded/index.md
    gluon/index.md
+   java/index.md
    nlp/index.md
    onnx/index.md
    python/index.md

From 3d499cb3584919b767142c5596211a7f7fb18d50 Mon Sep 17 00:00:00 2001
From: Jose Luis Contreras <joseluis.contreras.santos@gmail.com>
Date: Tue, 4 Dec 2018 14:12:36 +0100
Subject: [PATCH 21/54] Disabled flaky test
 test_gluon_data.test_recordimage_dataset_with_data_loader_multiworker
 (#13527)

---
 tests/python/unittest/test_gluon_data.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/python/unittest/test_gluon_data.py b/tests/python/unittest/test_gluon_data.py
index e4206095f9ba..d043a7c6b802 100644
--- a/tests/python/unittest/test_gluon_data.py
+++ b/tests/python/unittest/test_gluon_data.py
@@ -78,6 +78,7 @@ def _dataset_transform_fn(x, y):
     return x, y
 
 @with_seed()
+@unittest.skip("Flaky test: https://github.com/apache/incubator-mxnet/issues/13484")
 def test_recordimage_dataset_with_data_loader_multiworker():
     recfile = prepare_record()
     dataset = gluon.data.vision.ImageRecordDataset(recfile)

From 7dde0eb0e4dc910beabc023b45317bdb82d52a0f Mon Sep 17 00:00:00 2001
From: Pedro Larroy <928489+larroy@users.noreply.github.com>
Date: Tue, 4 Dec 2018 18:48:39 +0100
Subject: [PATCH 22/54] [MXNET-1234] Fix shape inference problems in Activation
 backward (#13409)

* Provide a failing test for ReLU activation shape inference bug

* Fix Activation backward shape inference

fixes: #13333

* Add softsign Activation to test_gluon.py

* Use activation in GPU if we are using CUDNN and not MKLDNN as it's happening right now

* Don't disable MKLDNN
---
 src/operator/elemwise_op_common.h     | 20 ++++---
 src/operator/nn/activation-inl.h      | 12 ++--
 src/operator/nn/activation.cc         | 79 ++++++++++++++++-----------
 src/operator/nn/activation.cu         | 30 ++++++----
 tests/cpp/operator/activation_perf.cc | 26 +++++++--
 tests/python/unittest/test_gluon.py   | 12 ++--
 6 files changed, 109 insertions(+), 70 deletions(-)

diff --git a/src/operator/elemwise_op_common.h b/src/operator/elemwise_op_common.h
index 4b8663bba6ea..e622ce216ad0 100644
--- a/src/operator/elemwise_op_common.h
+++ b/src/operator/elemwise_op_common.h
@@ -128,29 +128,33 @@ inline bool ElemwiseAttr(const nnvm::NodeAttrs& attrs,
   if (n_out != -1)
     out_size = static_cast<size_t>(n_out);
 
-  auto deduce = [&](std::vector<AttrType> *vec, size_t size, const char *name) {
+  CHECK_LE(in_size, in_attrs->size());
+  CHECK_LE(out_size, out_attrs->size());
+  auto deduce = [&](const std::vector<AttrType>& vec, size_t size, const char *name) {
       for (size_t i = 0; i < size; ++i) {
-        CHECK(assign(&dattr, (*vec)[i]))
+        CHECK(assign(&dattr, vec.at(i)))
           << "Incompatible attr in node " << attrs.name << " at " << i << "-th "
           << name << ": " << "expected " << attr_string(dattr)
-          << ", got " << attr_string((*vec)[i]);
+          << ", got " << attr_string(vec.at(i));
       }
     };
-  deduce(in_attrs, in_size, "input");
-  if (reverse_infer) deduce(out_attrs, out_size, "output");
+  deduce(*in_attrs, in_size, "input");
+  if (reverse_infer)
+      deduce(*out_attrs, out_size, "output");
 
   auto write = [&](std::vector<AttrType> *vec, size_t size, const char *name) {
       for (size_t i = 0; i < size; ++i) {
-        CHECK(assign(&(*vec)[i], dattr))
+        CHECK(assign(&(vec->at(i)), dattr))
           << "Incompatible attr in node " << attrs.name << " at " << i << "-th "
           << name << ": " << "expected " << attr_string(dattr)
-          << ", got " << attr_string((*vec)[i]);
+          << ", got " << attr_string(vec->at(i));
       }
     };
   write(in_attrs, in_size, "input");
   write(out_attrs, out_size, "output");
 
-  if (is_none(dattr)) return false;
+  if (is_none(dattr))
+      return false;
   return true;
 }
 
diff --git a/src/operator/nn/activation-inl.h b/src/operator/nn/activation-inl.h
index 2705177f951d..1d8e4c2b6cda 100644
--- a/src/operator/nn/activation-inl.h
+++ b/src/operator/nn/activation-inl.h
@@ -48,6 +48,9 @@ enum ActivationOpInputs {kData};
 enum ActivationOpOutputs {kOut};
 enum ActivationOpResource {kTempSpace};
 enum ActivationOpType {kReLU, kSigmoid, kTanh, kSoftReLU, kSoftSign};
+
+// Get the number of inputs to the gradient depending on the activation type
+int GradNumInputs(int act_type);
 }  // activation
 
 struct ActivationParam : public dmlc::Parameter<ActivationParam> {
@@ -199,13 +202,8 @@ void ActivationGradCompute(const nnvm::NodeAttrs& attrs,
                            const std::vector<OpReqType>& req,
                            const std::vector<TBlob>& outputs) {
   const ActivationParam& param = nnvm::get<ActivationParam>(attrs.parsed);
-#if (MXNET_USE_CUDNN == 1 || MXNET_USE_MKLDNN == 1)
-  bool relu = param.act_type == activation::kReLU;
-  CHECK_EQ(inputs.size(), relu ? 2U : 3U);
-#else
-  bool softsign = param.act_type == activation::kSoftSign;
-  CHECK_EQ(inputs.size(), softsign ? 3U : 2U);
-#endif
+  const int act_type = param.act_type;
+  CHECK_EQ(inputs.size(), activation::GradNumInputs(act_type));
   CHECK_EQ(outputs.size(), 1U);
   CHECK_EQ(req.size(), 1U);
   ActivationGradComputeImpl<xpu>(attrs, ctx, inputs, req, outputs);
diff --git a/src/operator/nn/activation.cc b/src/operator/nn/activation.cc
index ba44ebd4ed4d..305eeab21176 100644
--- a/src/operator/nn/activation.cc
+++ b/src/operator/nn/activation.cc
@@ -30,13 +30,34 @@
 #if MXNET_USE_MKLDNN == 1
 #include "./mkldnn/mkldnn_base-inl.h"
 #include "./mkldnn/mkldnn_ops-inl.h"
-#endif  // MXNET_USE_MKLDNN
+#endif  // MXNET_USE_MKLDNN == 1
 #include "../operator_common.h"
 #include "../../common/utils.h"
 
 namespace mxnet {
 namespace op {
 
+namespace activation {
+
+int GradNumInputs(int act_type) {
+    // check activation.cu \sa ActivationGradCompute
+    switch (act_type) {
+        case kReLU:
+            return 2;
+        case kSoftReLU:
+        case kSoftSign:
+        case kTanh:
+        case kSigmoid:
+            return 3;
+        default:
+            CHECK(false) << "missing activation type";
+    }
+    // unreachable
+    return -1;
+}
+
+}  // namespace activation
+
 DMLC_REGISTER_PARAMETER(ActivationParam);
 
 // This will determine the order of the inputs for backward computation.
@@ -44,24 +65,28 @@ struct ActivationGrad {
   const char *op_name;
   std::vector<nnvm::NodeEntry> operator()(const nnvm::NodePtr& n,
                                           const std::vector<nnvm::NodeEntry>& ograds) const {
+    // ograds, output...
     std::vector<nnvm::NodeEntry> heads(ograds.begin(), ograds.end());
     heads.emplace_back(nnvm::NodeEntry{n, activation::kOut, 0});
 
     const NodeAttrs& attrs = n->attrs;
+    using namespace activation;
     int act_type = dmlc::get<ActivationParam>(attrs.parsed).act_type;
-    if (act_type == activation::kSoftSign) {
-      // for softsign need the inputs to compute the activation.
-      heads.push_back(n->inputs[activation::kData]);
-    }
-
-#if (MXNET_USE_CUDNN == 1 || MXNET_USE_MKLDNN == 1)
     // for ReLU, no need to pass input data. This enables inplace optimization during the
     // forward pass.
-    if (act_type != activation::kReLU &&
-        act_type != activation::kSoftSign) {
-      heads.push_back(n->inputs[activation::kData]);
+    // check activation.cu \sa ActivationGradCompute
+    switch (act_type) {
+        case kReLU:
+            break;
+        case kSoftReLU:
+        case kSoftSign:
+        case kTanh:
+        case kSigmoid:
+            heads.push_back(n->inputs[activation::kData]);
+            break;
+        default:
+            CHECK(false) << "missing activation type";
     }
-#endif
     return MakeGradNode(op_name, n, heads, n->attrs.dict);
   }
 };
@@ -89,21 +114,19 @@ void ActivationGradComputeExCPU(const nnvm::NodeAttrs& attrs,
                                 const std::vector<OpReqType>& req,
                                 const std::vector<NDArray>& outputs) {
   const ActivationParam& param = nnvm::get<ActivationParam>(attrs.parsed);
-  bool relu = param.act_type == activation::kReLU;
-  CHECK_EQ(inputs.size(), relu ? 2U : 3U);
+  CHECK_EQ(inputs.size(), activation::GradNumInputs(param.act_type));
   if (SupportMKLDNN(inputs[0])) {
     MKLDNN_OPCHECK_INIT(true, outputs.size(), inputs, outputs);
     // XXX: for y = relu(x), y is passed as "in_data" to Backward()
-    MKLDNNActivationBackward(attrs, ctx, inputs[0], relu ? inputs[1] : inputs[2], req[0],
+    const bool relu = param.act_type == activation::kReLU;
+    MKLDNNActivationBackward(attrs, ctx, inputs.at(0), relu ? inputs.at(1) : inputs.at(2), req[0],
                              outputs[0]);
-     MKLDNN_OPCHECK_RUN(ActivationGradCompute<cpu>, attrs, ctx, inputs, req, outputs);
+    MKLDNN_OPCHECK_RUN(ActivationGradCompute<cpu>, attrs, ctx, inputs, req, outputs);
     return;
   }
   FallBackCompute(ActivationGradComputeImpl<cpu>, attrs, ctx, inputs, req, outputs);
 }
-#endif
 
-#if MXNET_USE_MKLDNN == 1
 inline static bool ActivationStorageType(const nnvm::NodeAttrs& attrs,
                                          const int dev_mask,
                                          DispatchMode* dispatch_mode,
@@ -122,16 +145,12 @@ inline static bool BackwardActStorageType(const nnvm::NodeAttrs& attrs,
                                           std::vector<int> *in_attrs,
                                           std::vector<int> *out_attrs) {
   const ActivationParam& param = nnvm::get<ActivationParam>(attrs.parsed);
-  if (param.act_type != activation::kReLU) {
-    CHECK_EQ(in_attrs->size(), 3U);
-  } else {
-    // for ReLU activation, the backward pass only needs ograd and output
-    CHECK_EQ(in_attrs->size(), 2U);
-  }
+  CHECK_EQ(in_attrs->size(), activation::GradNumInputs(param.act_type));
   return MKLDNNStorageType(attrs, dev_mask, SupportMKLDNNAct(param),
                            dispatch_mode, in_attrs, out_attrs);
 }
-#endif
+#endif  // MXNET_USE_MKLDNN == 1
+
 
 MXNET_OPERATOR_REGISTER_UNARY(Activation)
 .describe(R"code(Applies an activation function element-wise to the input.
@@ -163,18 +182,16 @@ The following activation functions are supported:
 
 NNVM_REGISTER_OP(_backward_Activation)
 .set_num_inputs([](const nnvm::NodeAttrs& attrs) {
-    int act_type = dmlc::get<ActivationParam>(attrs.parsed).act_type;
-    // for ReLU activation, the backward pass only needs ograd and output
-    if (act_type == activation::kReLU) return 2;
-    return 3;
-  })
+    const int act_type = dmlc::get<ActivationParam>(attrs.parsed).act_type;
+    return activation::GradNumInputs(act_type);
+})
 .set_num_outputs(1)
 .set_attr<nnvm::TIsBackward>("TIsBackward", true)
 #if MXNET_USE_MKLDNN == 1
 .set_attr<FInferStorageType>("FInferStorageType", BackwardActStorageType)
 #endif
-.set_attr<nnvm::FInferShape>("FInferShape", ElemwiseShape<3, 1>)
-.set_attr<nnvm::FInferType>("FInferType", ElemwiseType<3, 1>)
+.set_attr<nnvm::FInferShape>("FInferShape", ElemwiseShape<-1, 1>)
+.set_attr<nnvm::FInferType>("FInferType", ElemwiseType<-1, 1>)
 .set_attr<nnvm::FInplaceOption>("FInplaceOption", [](const NodeAttrs& attrs){
   return std::vector<std::pair<int, int> >{{0, 0}};
 })
diff --git a/src/operator/nn/activation.cu b/src/operator/nn/activation.cu
index 8892cc34f710..ec7db844b100 100644
--- a/src/operator/nn/activation.cu
+++ b/src/operator/nn/activation.cu
@@ -54,12 +54,13 @@ void ActivationCompute<gpu>(const nnvm::NodeAttrs& attrs,
   CHECK_EQ(inputs.size(), 1U);
   CHECK_EQ(outputs.size(), 1U);
   const ActivationParam& param = nnvm::get<ActivationParam>(attrs.parsed);
+  const int act_type = param.act_type;
 
   // SoftReLU and kSoftSign are both not supported by CUDNN yet
-  if (param.act_type == activation::kSoftReLU) {
+  if (act_type == activation::kSoftReLU) {
     ActivationForward<gpu, mshadow_op::softrelu, mshadow_op::softrelu_grad>(ctx,
       inputs[0], req[0], outputs[0]);
-  } else if (param.act_type == activation::kSoftSign) {
+  } else if (act_type == activation::kSoftSign) {
     ActivationForward<gpu, mshadow_op::softsign, mshadow_op::softsign_grad>(ctx,
       inputs[0], req[0], outputs[0]);
   } else {
@@ -76,23 +77,28 @@ void ActivationGradCompute<gpu>(const nnvm::NodeAttrs& attrs,
                                 const std::vector<OpReqType>& req,
                                 const std::vector<TBlob>& outputs) {
   const ActivationParam& param = nnvm::get<ActivationParam>(attrs.parsed);
-  bool relu = param.act_type == activation::kReLU;
-  CHECK_EQ(inputs.size(), relu ? 2U : 3U);
+  const int act_type = param.act_type;
+  CHECK_EQ(inputs.size(), activation::GradNumInputs(act_type));
   CHECK_EQ(outputs.size(), 1U);
   CHECK_EQ(req.size(), 1U);
 
   // both SoftReLU and SoftSign not supported by CUDNN yet
-  if (param.act_type == activation::kSoftReLU) {
+  if (act_type == activation::kSoftReLU) {
     ActivationBackward<gpu, mshadow_op::softrelu, mshadow_op::softrelu_grad>(
-      ctx, inputs[0], inputs[1], req[0], outputs[0]);
-  } else if (param.act_type == activation::kSoftSign) {
+      ctx, inputs.at(0), inputs.at(1), req[0], outputs[0]);
+  } else if (act_type == activation::kSoftSign) {
     ActivationBackward<gpu, mshadow_op::softsign, mshadow_op::softsign_grad>(
-      ctx, inputs[0], inputs[2], req[0], outputs[0]);
-  } else {
-    MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, {
+      ctx, inputs.at(0), inputs.at(2), req[0], outputs[0]);
+  } else if (act_type == activation::kReLU) {
+    MSHADOW_REAL_TYPE_SWITCH(inputs.at(0).type_flag_, DType, {
       // XXX: for y = relu(x), y is passed as "in_data" to Backward()
-      get_cudnn_op<DType>(param).Backward(ctx, inputs[0], relu ? inputs[1] : inputs[2],
-                                          inputs[1], req[0], outputs[0]);
+      get_cudnn_op<DType>(param).Backward(ctx, inputs.at(0), inputs.at(1),
+                                          inputs.at(1), req[0], outputs[0]);
+    });
+  } else {
+    MSHADOW_REAL_TYPE_SWITCH(inputs.at(0).type_flag_, DType, {
+      get_cudnn_op<DType>(param).Backward(ctx, inputs.at(0), inputs.at(2),
+                                          inputs.at(1), req[0], outputs[0]);
     });
   }
 }
diff --git a/tests/cpp/operator/activation_perf.cc b/tests/cpp/operator/activation_perf.cc
index 1bd8ca89c9f5..bba8a3ec5722 100644
--- a/tests/cpp/operator/activation_perf.cc
+++ b/tests/cpp/operator/activation_perf.cc
@@ -38,13 +38,27 @@ const kwargs_t basic_activation_args = { };
  * \brief Generic bidirectional sanity test
  */
 TEST(ACTIVATION_PERF, ExecuteBidirectional) {
+  using namespace std;
   TShape shape({5, 5});
-  kwargs_t kwargs = basic_activation_args;
-  kwargs.push_back({"act_type", "tanh"});
-
-  test::op::CoreOperatorRunner<float> runner;
-  runner.RunBidirectional(false, { shape }, test::op::CoreOpExecutor<float>::ArgsWithOpName(
-          kwargs, "Activation", "_backward_Activation"), 1);
+  vector<string> activations = {
+    "relu",
+    "sigmoid",
+    "tanh",
+    "softrelu",
+    "softsign"
+  };
+  for (const string& activation : activations) {
+    kwargs_t activation_args = {{"act_type", activation}};
+    test::op::CoreOperatorRunner<float> runner;
+    runner.RunBidirectional(false, { shape }, test::op::CoreOpExecutor<float>::ArgsWithOpName(
+            activation_args, "Activation", "_backward_Activation"), 1);
+  }
+  for (const string& activation : activations) {
+    kwargs_t activation_args = {{"act_type", activation}};
+    test::op::CoreOperatorRunner<float> runner;
+    runner.RunBidirectional(true, { shape }, test::op::CoreOpExecutor<float>::ArgsWithOpName(
+            activation_args, "Activation", "_backward_Activation"), 1);
+  }
 }
 
 /*!
diff --git a/tests/python/unittest/test_gluon.py b/tests/python/unittest/test_gluon.py
index 3049674821c9..abe6b136fe0c 100644
--- a/tests/python/unittest/test_gluon.py
+++ b/tests/python/unittest/test_gluon.py
@@ -2411,7 +2411,7 @@ def hybrid_forward(self, F, x):
             x_reshape = x.reshape(self.reshape)
             out = self.act(x_reshape)
             return out
-    acts = ["relu", "sigmoid", "tanh", "softrelu"]
+    acts = ["relu", "sigmoid", "tanh", "softrelu", "softsign"]
     for act in acts:
         x = mx.nd.random.uniform(-1, 1, shape=(4, 16, 32, 32))
         shape = (4, 32, 32, -1)
@@ -2433,7 +2433,7 @@ def hybrid_forward(self, F, x):
             out = self.act(x_slice)
             return out
 
-    acts = ["relu", "sigmoid", "tanh", "softrelu"]
+    acts = ["relu", "sigmoid", "tanh", "softrelu", "softsign"]
     for act in acts:
         x = mx.nd.random.uniform(-1, 1, shape=(8, 32, 64, 64))
         slice = [(0, 16, 32, 32), (4, 32, 64, 64)]
@@ -2457,7 +2457,7 @@ def hybrid_forward(self, F, x):
             y_reshape = y.reshape(self.reshape[1])
             out = self.act1(y_reshape)
             return out
-    acts = ["relu", "sigmoid", "tanh", "softrelu"]
+    acts = ["relu", "sigmoid", "tanh", "softrelu", "softsign"]
     for idx0, act0 in enumerate(acts):
         for idx1, act1 in enumerate(acts):
             if idx1 == idx0:
@@ -2484,7 +2484,7 @@ def hybrid_forward(self, F, x):
             y_slice = y.slice(begin=self.slice[1][0], end=self.slice[1][1])
             out = self.act1(y_slice)
             return out
-    acts = ["relu", "sigmoid", "tanh", "softrelu"]
+    acts = ["relu", "sigmoid", "tanh", "softrelu", "softsign"]
     for idx0, act0 in enumerate(acts):
         for idx1, act1 in enumerate(acts):
             if idx1 == idx0:
@@ -2512,7 +2512,7 @@ def hybrid_forward(self, F, x):
             y_slice = y.slice(begin=self.slice[0], end=self.slice[1])
             out = self.act1(y_slice)
             return out
-    acts = ["relu", "sigmoid", "tanh", "softrelu"]
+    acts = ["relu", "sigmoid", "tanh", "softrelu", "softsign"]
     for idx0, act0 in enumerate(acts):
         for idx1, act1 in enumerate(acts):
             if idx1 == idx0:
@@ -2541,7 +2541,7 @@ def hybrid_forward(self, F, x):
             y_reshape = y.reshape(self.reshape)
             out = self.act1(y_reshape)
             return out
-    acts = ["relu", "sigmoid", "tanh", "softrelu"]
+    acts = ["relu", "sigmoid", "tanh", "softrelu", "softsign"]
     for idx0, act0 in enumerate(acts):
         for idx1, act1 in enumerate(acts):
             if idx1 == idx0:

From d60f37bf317da6fc48e4fa4d52d02d51f13e95b9 Mon Sep 17 00:00:00 2001
From: Aaron Markham <markhama@amazon.com>
Date: Tue, 4 Dec 2018 11:16:09 -0800
Subject: [PATCH 23/54] =?UTF-8?q?Docs=20&=20website=20sphinx=20errors=20sq?=
 =?UTF-8?q?uished=20=F0=9F=8C=A6=20=20(#13488)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* fix scala ndarray docs; remove interpreter style

* fix docs error in kvstore

* remove interpreter format in examples

* remove python indicator for these non-functioning python code blocks; clears a sphinx error

* remove old table that was not being used and was triggering a sphinx error

* get rid of curly braces that were causing a pygments error

* fix ambiguous reference causing sphinx error

* nudging file for CI
---
 docs/api/scala/index.md                      |  18 +-
 docs/api/scala/kvstore.md                    |  98 +++++-----
 docs/api/scala/ndarray.md                    | 186 +++++++++----------
 docs/api/scala/symbol.md                     |  66 +++----
 docs/gluon/index.md                          |  10 +-
 docs/install/ubuntu_setup.md                 |  12 --
 docs/tutorials/r/fiveMinutesNeuralNetwork.md |   4 +-
 python/mxnet/gluon/parameter.py              |   2 +-
 8 files changed, 192 insertions(+), 204 deletions(-)

diff --git a/docs/api/scala/index.md b/docs/api/scala/index.md
index 8b32c9fe9e22..f7a150019872 100644
--- a/docs/api/scala/index.md
+++ b/docs/api/scala/index.md
@@ -19,6 +19,7 @@ See the [MXNet Scala API Documentation](docs/index.html#org.apache.mxnet.package
    symbol.md
 ```
 
+
 ## Image Classification with the Scala Infer API
 The Infer API can be used for single and batch image classification. More information can be found at the following locations:
 
@@ -32,20 +33,19 @@ The Infer API can be used for single and batch image classification. More inform
 You can perform tensor or matrix computation in pure Scala:
 
 ```scala
-   scala> import org.apache.mxnet._
    import org.apache.mxnet._
 
-   scala> val arr = NDArray.ones(2, 3)
-   arr: org.apache.mxnet.NDArray = org.apache.mxnet.NDArray@f5e74790
+   val arr = NDArray.ones(2, 3)
+   // arr: org.apache.mxnet.NDArray = org.apache.mxnet.NDArray@f5e74790
 
-   scala> arr.shape
-   res0: org.apache.mxnet.Shape = (2,3)
+   arr.shape
+   // org.apache.mxnet.Shape = (2,3)
 
-   scala> (arr * 2).toArray
-   res2: Array[Float] = Array(2.0, 2.0, 2.0, 2.0, 2.0, 2.0)
+   (arr * 2).toArray
+   // Array[Float] = Array(2.0, 2.0, 2.0, 2.0, 2.0, 2.0)
 
-   scala> (arr * 2).shape
-   res3: org.apache.mxnet.Shape = (2,3)
+   (arr * 2).shape
+   // org.apache.mxnet.Shape = (2,3)
 ```
 
 
diff --git a/docs/api/scala/kvstore.md b/docs/api/scala/kvstore.md
index 2157176d23bf..e195c4d7e720 100644
--- a/docs/api/scala/kvstore.md
+++ b/docs/api/scala/kvstore.md
@@ -16,13 +16,13 @@ Let's consider a simple example. It initializes
 a (`int`, `NDArray`) pair into the store, and then pulls the value out.
 
 ```scala
-    scala> val kv = KVStore.create("local") // create a local kv store.
-    scala> val shape = Shape(2,3)
-    scala> kv.init(3, NDArray.ones(shape)*2)
-    scala> val a = NDArray.zeros(shape)
-    scala> kv.pull(3, out = a)
-    scala> a.toArray
-    Array[Float] = Array(2.0, 2.0, 2.0, 2.0, 2.0, 2.0)
+val kv = KVStore.create("local") // create a local kv store.
+val shape = Shape(2,3)
+kv.init(3, NDArray.ones(shape)*2)
+val a = NDArray.zeros(shape)
+kv.pull(3, out = a)
+a.toArray
+// Array[Float] = Array(2.0, 2.0, 2.0, 2.0, 2.0, 2.0)
 ```
 
 ### Push, Aggregation, and Updater
@@ -30,10 +30,10 @@ a (`int`, `NDArray`) pair into the store, and then pulls the value out.
 For any key that's been initialized, you can push a new value with the same shape to the key, as follows:
 
 ```scala
-    scala> kv.push(3, NDArray.ones(shape)*8)
-    scala> kv.pull(3, out = a) // pull out the value
-    scala> a.toArray
-    Array[Float] = Array(8.0, 8.0, 8.0, 8.0, 8.0, 8.0)
+kv.push(3, NDArray.ones(shape)*8)
+kv.pull(3, out = a) // pull out the value
+a.toArray
+// Array[Float] = Array(8.0, 8.0, 8.0, 8.0, 8.0, 8.0)
 ```
 
 The data that you want to push can be stored on any device. Furthermore, you can push multiple
@@ -41,13 +41,13 @@ values into the same key, where KVStore first sums all of these
 values, and then pushes the aggregated value, as follows:
 
 ```scala
-    scala> val gpus = Array(Context.gpu(0), Context.gpu(1), Context.gpu(2), Context.gpu(3))
-    scala> val b = Array(NDArray.ones(shape, gpus(0)), NDArray.ones(shape, gpus(1)), \
-    scala> NDArray.ones(shape, gpus(2)), NDArray.ones(shape, gpus(3)))
-    scala> kv.push(3, b)
-    scala> kv.pull(3, out = a)
-    scala> a.toArray
-    Array[Float] = Array(4.0, 4.0, 4.0, 4.0, 4.0, 4.0)
+val gpus = Array(Context.gpu(0), Context.gpu(1), Context.gpu(2), Context.gpu(3))
+val b = Array(NDArray.ones(shape, gpus(0)), NDArray.ones(shape, gpus(1)),
+NDArray.ones(shape, gpus(2)), NDArray.ones(shape, gpus(3)))
+kv.push(3, b)
+kv.pull(3, out = a)
+a.toArray
+// Array[Float] = Array(4.0, 4.0, 4.0, 4.0, 4.0, 4.0)
 ```
 
 For each push command, KVStore applies the pushed value to the value stored by an
@@ -55,22 +55,22 @@ For each push command, KVStore applies the pushed value to the value stored by a
 control how data is merged.
 
 ```scala
-    scala> val updater = new MXKVStoreUpdater {
-              override def update(key: Int, input: NDArray, stored: NDArray): Unit = {
-                println(s"update on key $key")
-                stored += input * 2
-              }
-              override def dispose(): Unit = {}
-           }
-    scala> kv.setUpdater(updater)
-    scala> kv.pull(3, a)
-    scala> a.toArray
-    Array[Float] = Array(4.0, 4.0, 4.0, 4.0, 4.0, 4.0)
-    scala> kv.push(3, NDArray.ones(shape))
-    update on key 3
-    scala> kv.pull(3, a)
-    scala> a.toArray
-    Array[Float] = Array(6.0, 6.0, 6.0, 6.0, 6.0, 6.0)
+val updater = new MXKVStoreUpdater {
+          override def update(key: Int, input: NDArray, stored: NDArray): Unit = {
+            println(s"update on key $key")
+            stored += input * 2
+          }
+          override def dispose(): Unit = {}
+       }
+kv.setUpdater(updater)
+kv.pull(3, a)
+a.toArray
+// Array[Float] = Array(4.0, 4.0, 4.0, 4.0, 4.0, 4.0)
+kv.push(3, NDArray.ones(shape))
+// update on key 3
+kv.pull(3, a)
+a.toArray
+// Array[Float] = Array(6.0, 6.0, 6.0, 6.0, 6.0, 6.0)
 ```
 
 ### Pull
@@ -79,11 +79,11 @@ You've already seen how to pull a single key-value pair. Similar to the way that
 pull the value into several devices with a single call.
 
 ```scala
-    scala> val b = Array(NDArray.ones(shape, gpus(0)), NDArray.ones(shape, gpus(1)),\
-    scala> NDArray.ones(shape, gpus(2)), NDArray.ones(shape, gpus(3)))
-    scala> kv.pull(3, outs = b)
-    scala> b(1).toArray
-    Array[Float] = Array(6.0, 6.0, 6.0, 6.0, 6.0, 6.0)
+val b = Array(NDArray.ones(shape, gpus(0)), NDArray.ones(shape, gpus(1)),
+NDArray.ones(shape, gpus(2)), NDArray.ones(shape, gpus(3)))
+kv.pull(3, outs = b)
+b(1).toArray
+// Array[Float] = Array(6.0, 6.0, 6.0, 6.0, 6.0, 6.0)
 ```
 
 ## List Key-Value Pairs
@@ -92,14 +92,14 @@ All of the operations that we've discussed so far are performed on a single key.
 the interface for generating a list of key-value pairs. For a single device, use the following:
 
 ```scala
-    scala> val keys = Array(5, 7, 9)
-    scala> kv.init(keys, Array.fill(keys.length)(NDArray.ones(shape)))
-    scala> kv.push(keys, Array.fill(keys.length)(NDArray.ones(shape)))
-    update on key: 5
-    update on key: 7
-    update on key: 9
-    scala> val b = Array.fill(keys.length)(NDArray.zeros(shape))
-    scala> kv.pull(keys, outs = b)
-    scala> b(1).toArray
-    Array[Float] = Array(3.0, 3.0, 3.0, 3.0, 3.0, 3.0)
+val keys = Array(5, 7, 9)
+kv.init(keys, Array.fill(keys.length)(NDArray.ones(shape)))
+kv.push(keys, Array.fill(keys.length)(NDArray.ones(shape)))
+// update on key 5
+// update on key 7
+// update on key 9
+val b = Array.fill(keys.length)(NDArray.zeros(shape))
+kv.pull(keys, outs = b)
+b(1).toArray
+// Array[Float] = Array(3.0, 3.0, 3.0, 3.0, 3.0, 3.0)
 ```
diff --git a/docs/api/scala/ndarray.md b/docs/api/scala/ndarray.md
index 3d4bc37a19ef..9e87d397c8b8 100644
--- a/docs/api/scala/ndarray.md
+++ b/docs/api/scala/ndarray.md
@@ -14,13 +14,13 @@ Topics:
 Create `mxnet.ndarray` as follows:
 
 ```scala
-    scala> import org.apache.mxnet._
-    scala> // all-zero array of dimension 100x50
-    scala> val a = NDArray.zeros(100, 50)
-    scala> // all-one array of dimension 256x32x128x1
-    scala> val b = NDArray.ones(256, 32, 128, 1)
-    scala> // initialize array with contents, you can specify dimensions of array using Shape parameter while creating array.
-    scala> val c = NDArray.array(Array(1, 2, 3, 4, 5, 6), shape = Shape(2, 3))
+import org.apache.mxnet._
+// all-zero array of dimension 100x50
+val a = NDArray.zeros(100, 50)
+// all-one array of dimension 256x32x128x1
+val b = NDArray.ones(256, 32, 128, 1)
+// initialize array with contents, you can specify dimensions of array using Shape parameter while creating array.
+val c = NDArray.array(Array(1, 2, 3, 4, 5, 6), shape = Shape(2, 3))
 ```
 This is similar to the way you use `numpy`.
 ## NDArray Operations
@@ -30,77 +30,77 @@ We provide some basic ndarray operations, like arithmetic and slice operations.
 ### Arithmetic Operations
 
 ```scala
-    scala> import org.apache.mxnet._
-    scala> val a = NDArray.zeros(100, 50)
-    scala> a.shape
-    org.apache.mxnet.Shape = (100,50)
-    scala> val b = NDArray.ones(100, 50)
-    scala> // c and d will be calculated in parallel here!
-    scala> val c = a + b
-    scala> val d = a - b
-    scala> // inplace operation, b's contents will be modified, but c and d won't be affected.
-    scala> b += d
+import org.apache.mxnet._
+val a = NDArray.zeros(100, 50)
+a.shape
+// org.apache.mxnet.Shape = (100,50)
+val b = NDArray.ones(100, 50)
+// c and d will be calculated in parallel here!
+val c = a + b
+val d = a - b
+// inplace operation, b's contents will be modified, but c and d won't be affected.
+b += d
 ```
 
 ### Multiplication/Division Operations
 
 ```scala
-    scala> import org.apache.mxnet._
-    //Multiplication
-    scala> val ndones = NDArray.ones(2, 1)
-    scala> val ndtwos = ndones * 2
-    scala> ndtwos.toArray
-    Array[Float] = Array(2.0, 2.0)
-    scala> (ndones * ndones).toArray
-    Array[Float] = Array(1.0, 1.0)
-    scala> (ndtwos * ndtwos).toArray
-    Array[Float] = Array(4.0, 4.0)
-    scala> ndtwos *= ndtwos // inplace
-    scala> ndtwos.toArray
-    Array[Float] = Array(4.0, 4.0)
-
-    //Division
-    scala> val ndones = NDArray.ones(2, 1)
-    scala> val ndzeros = ndones - 1f
-    scala> val ndhalves = ndones / 2
-    scala> ndhalves.toArray
-    Array[Float] = Array(0.5, 0.5)
-    scala> (ndhalves / ndhalves).toArray
-    Array[Float] = Array(1.0, 1.0)
-    scala> (ndones / ndones).toArray
-    Array[Float] = Array(1.0, 1.0)
-    scala> (ndzeros / ndones).toArray
-    Array[Float] = Array(0.0, 0.0)
-    scala> ndhalves /= ndhalves
-    scala> ndhalves.toArray
-    Array[Float] = Array(1.0, 1.0)
+import org.apache.mxnet._
+// Multiplication
+val ndones = NDArray.ones(2, 1)
+val ndtwos = ndones * 2
+ndtwos.toArray
+// Array[Float] = Array(2.0, 2.0)
+(ndones * ndones).toArray
+// Array[Float] = Array(1.0, 1.0)
+(ndtwos * ndtwos).toArray
+// Array[Float] = Array(4.0, 4.0)
+ndtwos *= ndtwos // inplace
+ndtwos.toArray
+// Array[Float] = Array(4.0, 4.0)
+
+// Division
+val ndones = NDArray.ones(2, 1)
+val ndzeros = ndones - 1f
+val ndhalves = ndones / 2
+ndhalves.toArray
+// Array[Float] = Array(0.5, 0.5)
+(ndhalves / ndhalves).toArray
+// Array[Float] = Array(1.0, 1.0)
+(ndones / ndones).toArray
+// Array[Float] = Array(1.0, 1.0)
+(ndzeros / ndones).toArray
+// Array[Float] = Array(0.0, 0.0)
+ndhalves /= ndhalves
+ndhalves.toArray
+// Array[Float] = Array(1.0, 1.0)
 ```
 
 ### Slice Operations
 
 ```scala
-    scala> import org.apache.mxnet._
-    scala> val a = NDArray.array(Array(1f, 2f, 3f, 4f, 5f, 6f), shape = Shape(3, 2))
-    scala> val a1 = a.slice(1)   
-    scala> assert(a1.shape === Shape(1, 2))
-    scala> assert(a1.toArray === Array(3f, 4f))
-
-    scala> val a2 = arr.slice(1, 3)
-    scala> assert(a2.shape === Shape(2, 2))
-    scala> assert(a2.toArray === Array(3f, 4f, 5f, 6f))
+import org.apache.mxnet._
+val a = NDArray.array(Array(1f, 2f, 3f, 4f, 5f, 6f), shape = Shape(3, 2))
+val a1 = a.slice(1)
+assert(a1.shape === Shape(1, 2))
+assert(a1.toArray === Array(3f, 4f))
+
+val a2 = a.slice(1, 3)
+assert(a2.shape === Shape(2, 2))
+assert(a2.toArray === Array(3f, 4f, 5f, 6f))
 ```
 
 ### Dot Product
 
 ```scala
-    scala> import org.apache.mxnet._
-    scala> val arr1 = NDArray.array(Array(1f, 2f), shape = Shape(1, 2))
-    scala> val arr2 = NDArray.array(Array(3f, 4f), shape = Shape(2, 1))   
-    scala> val res = NDArray.dot(arr1, arr2)
-    scala> res.shape
-    org.apache.mxnet.Shape = (1,1)
-    scala> res.toArray
-    Array[Float] = Array(11.0)
+import org.apache.mxnet._
+val arr1 = NDArray.array(Array(1f, 2f), shape = Shape(1, 2))
+val arr2 = NDArray.array(Array(3f, 4f), shape = Shape(2, 1))
+val res = NDArray.dot(arr1, arr2)
+res.shape
+// org.apache.mxnet.Shape = (1,1)
+res.toArray
+// Array[Float] = Array(11.0)
 ```
 
 ### Save and Load NDArray
@@ -108,18 +108,18 @@ We provide some basic ndarray operations, like arithmetic and slice operations.
 You can use MXNet functions to save and load a list or dictionary of NDArrays from file systems, as follows:
 
 ```scala
-    scala> import org.apache.mxnet._
-    scala> val a = NDArray.zeros(100, 200)
-    scala> val b = NDArray.zeros(100, 200)
-    scala> // save list of NDArrays
-    scala> NDArray.save("/path/to/array/file", Array(a, b))
-    scala> // save dictionary of NDArrays to AWS S3
-    scala> NDArray.save("s3://path/to/s3/array", Map("A" -> a, "B" -> b))
-    scala> // save list of NDArrays to hdfs.
-    scala> NDArray.save("hdfs://path/to/hdfs/array", Array(a, b))
-    scala> val from_file = NDArray.load("/path/to/array/file")
-    scala> val from_s3 = NDArray.load("s3://path/to/s3/array")
-    scala> val from_hdfs = NDArray.load("hdfs://path/to/hdfs/array")
+import org.apache.mxnet._
+val a = NDArray.zeros(100, 200)
+val b = NDArray.zeros(100, 200)
+// save list of NDArrays
+NDArray.save("/path/to/array/file", Array(a, b))
+// save dictionary of NDArrays to AWS S3
+NDArray.save("s3://path/to/s3/array", Map("A" -> a, "B" -> b))
+// save list of NDArrays to hdfs.
+NDArray.save("hdfs://path/to/hdfs/array", Array(a, b))
+val from_file = NDArray.load("/path/to/array/file")
+val from_s3 = NDArray.load("s3://path/to/s3/array")
+val from_hdfs = NDArray.load("hdfs://path/to/hdfs/array")
 ```
 The good thing about using the `save` and `load` interface is that you can use the format across all `mxnet` language bindings. They also already support Amazon S3 and HDFS.
 
@@ -128,29 +128,29 @@ The good thing about using the `save` and `load` interface is that you can use t
 Device information is stored in the `mxnet.Context` structure. When creating NDArray in MXNet, you can use the context argument (the default is the CPU context) to create arrays on specific devices as follows:
 
 ```scala
-    scala> import org.apache.mxnet._
-    scala> val cpu_a = NDArray.zeros(100, 200)
-    scala> cpu_a.context
-    org.apache.mxnet.Context = cpu(0)
-    scala> val ctx = Context.gpu(0)
-    scala> val gpu_b = NDArray.zeros(Shape(100, 200), ctx)
-    scala> gpu_b.context
-    org.apache.mxnet.Context = gpu(0)
+import org.apache.mxnet._
+val cpu_a = NDArray.zeros(100, 200)
+cpu_a.context
+// org.apache.mxnet.Context = cpu(0)
+val ctx = Context.gpu(0)
+val gpu_b = NDArray.zeros(Shape(100, 200), ctx)
+gpu_b.context
+// org.apache.mxnet.Context = gpu(0)
 ```
 
 Currently, we *do not* allow operations among arrays from different contexts. To manually enable this, use the `copyto` member function to copy the content to different devices, and continue computation:
 
 ```scala
-    scala> import org.apache.mxnet._
-    scala> val x = NDArray.zeros(100, 200)
-    scala> val ctx = Context.gpu(0)
-    scala> val y = NDArray.zeros(Shape(100, 200), ctx)
-    scala> val z = x + y
-    mxnet.base.MXNetError: [13:29:12] src/ndarray/ndarray.cc:33:
-    Check failed: lhs.ctx() == rhs.ctx() operands context mismatch
-    scala> val cpu_y = NDArray.zeros(100, 200)
-    scala> y.copyto(cpu_y)
-    scala> val z = x + cpu_y
+import org.apache.mxnet._
+val x = NDArray.zeros(100, 200)
+val ctx = Context.gpu(0)
+val y = NDArray.zeros(Shape(100, 200), ctx)
+val z = x + y
+// mxnet.base.MXNetError: [13:29:12] src/ndarray/ndarray.cc:33:
+// Check failed: lhs.ctx() == rhs.ctx() operands context mismatch
+val cpu_y = NDArray.zeros(100, 200)
+y.copyto(cpu_y)
+val z = x + cpu_y
 ```
 
 ## Next Steps
diff --git a/docs/api/scala/symbol.md b/docs/api/scala/symbol.md
index 5b73ae5d6002..c10d5fb60a24 100644
--- a/docs/api/scala/symbol.md
+++ b/docs/api/scala/symbol.md
@@ -20,14 +20,14 @@ You can configure the graphs either at the level of neural network layer operati
 The following example configures a two-layer neural network.
 
 ```scala
-    scala> import org.apache.mxnet._
-    scala> val data = Symbol.Variable("data")
-    scala> val fc1 = Symbol.api.FullyConnected(Some(data), num_hidden = 128, name = "fc1")
-    scala> val act1 = Symbol.api.Activation(Some(fc1), "relu", "relu1")
-    scala> val fc2 = Symbol.api.FullyConnected(some(act1), num_hidden = 64, name = "fc2")
-    scala> val net = Symbol.api.SoftmaxOutput(Some(fc2), name = "out")
-    scala> :type net
-    org.apache.mxnet.Symbol
+    import org.apache.mxnet._
+    val data = Symbol.Variable("data")
+    val fc1 = Symbol.api.FullyConnected(Some(data), num_hidden = 128, name = "fc1")
+    val act1 = Symbol.api.Activation(Some(fc1), "relu", "relu1")
+    val fc2 = Symbol.api.FullyConnected(Some(act1), num_hidden = 64, name = "fc2")
+    val net = Symbol.api.SoftmaxOutput(Some(fc2), name = "out")
+    :type net
+    // org.apache.mxnet.Symbol
 ```
 
 The basic arithmetic operators (plus, minus, div, multiplication) are overloaded for
@@ -36,10 +36,10 @@ The basic arithmetic operators (plus, minus, div, multiplication) are overloaded
 The following example creates a computation graph that adds two inputs together.
 
 ```scala
-    scala> import org.apache.mxnet._
-    scala> val a = Symbol.Variable("a")
-    scala> val b = Symbol.Variable("b")
-    scala> val c = a + b
+    import org.apache.mxnet._
+    val a = Symbol.Variable("a")
+    val b = Symbol.Variable("b")
+    val c = a + b
 ```
 
 ## Symbol Attributes
@@ -54,7 +54,7 @@ For proper communication with the C++ backend, both the key and values of the at
 
 ```
     data.attr("mood")
-    res6: Option[String] = Some(angry)
+    // Option[String] = Some(angry)
 ```
 
 To attach attributes, you can use ```AttrScope```. ```AttrScope``` automatically adds the specified attributes to all of the symbols created within that scope. The user can also inherit this object to change naming behavior. For example:
@@ -71,7 +71,7 @@ To attach attributes, you can use ```AttrScope```. ```AttrScope``` automatically
 
     val exceedScopeData = Symbol.Variable("data3")
     assert(exceedScopeData.attr("group") === None, "No group attr in global attr scope")
-```  
+```
 
 ## Serialization
 
@@ -83,14 +83,14 @@ Refer to [API documentation](http://mxnet.incubator.apache.org/api/scala/docs/in
 The following example shows how to save a symbol to an S3 bucket, load it back, and compare two symbols using a JSON string.
 
 ```scala
-    scala> import org.apache.mxnet._
-    scala> val a = Symbol.Variable("a")
-    scala> val b = Symbol.Variable("b")
-    scala> val c = a + b
-    scala> c.save("s3://my-bucket/symbol-c.json")
-    scala> val c2 = Symbol.load("s3://my-bucket/symbol-c.json")
-    scala> c.toJson == c2.toJson
-    Boolean = true
+    import org.apache.mxnet._
+    val a = Symbol.Variable("a")
+    val b = Symbol.Variable("b")
+    val c = a + b
+    c.save("s3://my-bucket/symbol-c.json")
+    val c2 = Symbol.load("s3://my-bucket/symbol-c.json")
+    c.toJson == c2.toJson
+    // Boolean = true
 ```
 
 ## Executing Symbols
@@ -101,25 +101,25 @@ handled by the high-level [Model class](model.md) and the [`fit()`] function.
 
 For neural networks used in "feed-forward", "prediction", or "inference" mode (all terms for the same
 thing: running a trained network), the input arguments are the
-input data, and the weights of the neural network that were learned during training.  
+input data, and the weights of the neural network that were learned during training.
 
 To manually execute a set of symbols, you need to create an [`Executor`] object,
-which is typically constructed by calling the [`simpleBind(<parameters>)`] method on a symbol.  
+which is typically constructed by calling the [`simpleBind(<parameters>)`] method on a symbol.
 
 ## Multiple Outputs
 
 To group the symbols together, use the [mxnet.symbol.Group](#mxnet.symbol.Group) function.
 
 ```scala
-    scala> import org.apache.mxnet._
-    scala> val data = Symbol.Variable("data")
-    scala> val fc1 = Symbol.api.FullyConnected(Some(data), num_hidden = 128, name = "fc1")
-    scala> val act1 = Symbol.api.Activation(Some(fc1), "relu", "relu1")
-    scala> val fc2 = Symbol.api.FullyConnected(Some(act1), num_hidden = 64, name = "fc2")
-    scala> val net = Symbol.api.SoftmaxOutput(Some(fc2), name = "out")
-    scala> val group = Symbol.Group(fc1, net)
-    scala> group.listOutputs()
-    IndexedSeq[String] = ArrayBuffer(fc1_output, out_output)
+    import org.apache.mxnet._
+    val data = Symbol.Variable("data")
+    val fc1 = Symbol.api.FullyConnected(Some(data), num_hidden = 128, name = "fc1")
+    val act1 = Symbol.api.Activation(Some(fc1), "relu", "relu1")
+    val fc2 = Symbol.api.FullyConnected(Some(act1), num_hidden = 64, name = "fc2")
+    val net = Symbol.api.SoftmaxOutput(Some(fc2), name = "out")
+    val group = Symbol.Group(fc1, net)
+    group.listOutputs()
+    // IndexedSeq[String] = ArrayBuffer(fc1_output, out_output)
 ```
 
 After you get the ```group```, you can bind on ```group``` instead.
diff --git a/docs/gluon/index.md b/docs/gluon/index.md
index 4f6d3c10f38c..c34ee9c22739 100644
--- a/docs/gluon/index.md
+++ b/docs/gluon/index.md
@@ -43,7 +43,7 @@ The community is also working on parallel effort to create a foundational resour
 
 Use plug-and-play neural network building blocks, including predefined layers, optimizers, and initializers:
 
-```python
+```
 net = gluon.nn.Sequential()
 # When instantiated, Sequential stores a chain of neural network layers.
 # Once presented with data, Sequential executes each layer in turn, using
@@ -59,7 +59,7 @@ with net.name_scope():
 
 Prototype, build, and train neural networks in fully imperative manner using the MXNet autograd package and the Gluon trainer method:
 
-```python
+```
 epochs = 10
 
 for e in range(epochs):
@@ -76,7 +76,7 @@ for e in range(epochs):
 
 Build neural networks on the fly for use cases where neural networks must change in size and shape during model training:
 
-```python
+```
 def forward(self, F, inputs, tree):
     children_outputs = [self.forward(F, inputs, child)
                         for child in tree.children]
@@ -89,7 +89,7 @@ def forward(self, F, inputs, tree):
 
 Easily cache the neural network to achieve high performance by defining your neural network with ``HybridSequential`` and calling the ``hybridize`` method:
 
-```python
+```
 net = nn.HybridSequential()
 with net.name_scope():
     net.add(nn.Dense(256, activation="relu"))
@@ -97,7 +97,7 @@ with net.name_scope():
     net.add(nn.Dense(2))
 ```
 
-```python
+```
 net.hybridize()
 ```
 
diff --git a/docs/install/ubuntu_setup.md b/docs/install/ubuntu_setup.md
index 9961c706af1d..7d8da182b070 100644
--- a/docs/install/ubuntu_setup.md
+++ b/docs/install/ubuntu_setup.md
@@ -79,18 +79,6 @@ Alternatively, you can use the table below to select the package that suits your
 #### pip Package Availability
 
 The following table presents the pip packages that are recommended for each version of MXNet.
-<!-- Table does not render - using a picture alternative
-| Package / MXNet Version | 1.3.0 | 1.2.1 | 1.1.0 | 1.0.0 | 0.12.1 | 0.11.0 |
-|-|:-:|:-:|:-:|:-:|:-:|:-:|
-| mxnet-cu92mkl | <i class="fas fa-check"></i> | <i class="fas fa-check"></i> | <i class="far fa-times-circle"></i> | <i class="far fa-times-circle"></i> | <i class="far fa-times-circle"></i> | <i class="far fa-times-circle"></i> |
-| mxnet-cu92 | <i class="fas fa-check"></i> | <i class="fas fa-check"></i> | <i class="far fa-times-circle"></i> | <i class="far fa-times-circle"></i> | <i class="far fa-times-circle"></i> | <i class="far fa-times-circle"></i> |
-| mxnet-cu90mkl | <i class="fas fa-check"></i> | <i class="fas fa-check"></i> | <i class="fas fa-check"></i> | <i class="fas fa-check"></i> | <i class="fas fa-check"></i> | <i class="far fa-times-circle"></i> |
-| mxnet-cu90 | <i class="fas fa-check"></i> | <i class="fas fa-check"></i> | <i class="fas fa-check"></i> | <i class="fas fa-check"></i> | <i class="fas fa-check"></i> | <i class="far fa-times-circle"></i> |
-| mxnet-cu80mkl | <i class="fas fa-check"></i> | <i class="fas fa-check"></i> | <i class="fas fa-check"></i> | <i class="fas fa-check"></i> | <i class="fas fa-check"></i> | <i class="fas fa-check"></i> |
-| mxnet-cu80 | <i class="fas fa-check"></i> | <i class="fas fa-check"></i> | <i class="fas fa-check"></i> | <i class="fas fa-check"></i> | <i class="fas fa-check"></i> | <i class="fas fa-check"></i> |
-| mxnet-mkl | <i class="fas fa-check"></i> | <i class="fas fa-check"></i> | <i class="fas fa-check"></i> | <i class="fas fa-check"></i> | <i class="fas fa-check"></i> | <i class="fas fa-check"></i> |
-| mxnet | <i class="fas fa-check"></i> | <i class="fas fa-check"></i> | <i class="fas fa-check"></i> | <i class="fas fa-check"></i> | <i class="fas fa-check"></i> | <i class="fas fa-check"></i> |
--->
 
 ![pip package table](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/install/pip-packages.png)
 
diff --git a/docs/tutorials/r/fiveMinutesNeuralNetwork.md b/docs/tutorials/r/fiveMinutesNeuralNetwork.md
index a2ce5ecd3761..6d79cd288d2c 100644
--- a/docs/tutorials/r/fiveMinutesNeuralNetwork.md
+++ b/docs/tutorials/r/fiveMinutesNeuralNetwork.md
@@ -1,7 +1,7 @@
 Develop a Neural Network with MXNet in Five Minutes
 =============================================
 
-This tutorial is designed for new users of the `mxnet` package for R. It shows how to construct a neural network to do regression in 5 minutes. It shows how to perform classification and regression tasks, respectively. The data we use is in the `mlbench` package. Instructions to install R and MXNet's R package in different environments can be found [here](http://mxnet.incubator.apache.org/install/index.html?platform=Linux&language=R&processor=CPU). 
+This tutorial is designed for new users of the `mxnet` package for R. It shows how to construct a neural network to do regression in 5 minutes. It shows how to perform classification and regression tasks, respectively. The data we use is in the `mlbench` package. Instructions to install R and MXNet's R package in different environments can be found [here](http://mxnet.incubator.apache.org/install/index.html?platform=Linux&language=R&processor=CPU).
 
 ## Classification
 
@@ -88,7 +88,7 @@ Note that `mx.set.seed` controls the random process in `mxnet`. You can see the
 
 To get an idea of what is happening, view the computation graph from R:
 
- ```{r}
+ ```r
     graph.viz(model$symbol)
  ```
 
diff --git a/python/mxnet/gluon/parameter.py b/python/mxnet/gluon/parameter.py
index b3d8f80318ba..2e130d498c14 100644
--- a/python/mxnet/gluon/parameter.py
+++ b/python/mxnet/gluon/parameter.py
@@ -755,7 +755,7 @@ def get_constant(self, name, value=None):
 
         Returns
         -------
-        Constant
+        :py:class:`.Constant`
             The created or retrieved :py:class:`.Constant`.
         """
         name = self.prefix + name

From 6ae647dde300453694aca2e65c586cd318acedf4 Mon Sep 17 00:00:00 2001
From: Denisa Roberts <d.roberts@vt.edu>
Date: Tue, 4 Dec 2018 14:18:29 -0500
Subject: [PATCH 24/54] [MXNET-1235] Add a test for AdaMax optimizer (#13467)

* Add a test for AdaMax optimizer

* Modify nested for loop with itertools.product and left tolerance to default

* Trigger
---
 tests/python/unittest/test_optimizer.py | 76 ++++++++++++++++++++++++-
 1 file changed, 75 insertions(+), 1 deletion(-)

diff --git a/tests/python/unittest/test_optimizer.py b/tests/python/unittest/test_optimizer.py
index 334b7d4c0fdb..b03dcdcfba44 100644
--- a/tests/python/unittest/test_optimizer.py
+++ b/tests/python/unittest/test_optimizer.py
@@ -16,6 +16,7 @@
 # under the License.
 
 import numpy as np
+import itertools
 import mxnet as mx
 import mxnet.lr_scheduler as lr_scheduler
 from mxnet import gluon
@@ -501,7 +502,6 @@ def test_ftml():
 
 
 # ADAM
-
 class PyAdam(mx.optimizer.Optimizer):
     """python reference implemenation of adam"""
     def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8,
@@ -613,6 +613,80 @@ def test_adam():
                                           dtype, w_stype='default', g_stype='row_sparse',
                                           rtol=1e-4, atol=2e-5)
 
+
+# AdaMax
+class PyAdamax(mx.optimizer.Optimizer):
+    """The python reference of AdaMax optimizer.
+
+    This class implements the AdaMax optimizer, one variant of Adam based on the infinity norm,
+    available at http://arxiv.org/abs/1412.6980 Section 7.
+
+    The optimizer updates the weight by::
+        grad = clip(grad * rescale_grad + wd * weight, clip_gradient)
+        m = beta1 * m + (1 - beta1) * grad
+        u = maximum(beta2 * u, abs(grad))
+        weight -= lr / (1 - beta1**t) * m / u
+
+    This optimizer accepts the following parameters in addition to those accepted
+    by :class:`.Optimizer`.
+
+    Parameters
+    ----------
+    beta1 : float, optional
+        Exponential decay rate for the first moment estimates.
+    beta2 : float, optional
+        Exponential decay rate for the second moment estimates.
+    """
+    def __init__(self, learning_rate=0.002, beta1=0.9, beta2=0.999, **kwargs):
+        super(PyAdamax, self).__init__(learning_rate=learning_rate, **kwargs)
+        self.beta1 = beta1
+        self.beta2 = beta2
+
+    def create_state(self, index, weight):
+        return (mx.nd.zeros(weight.shape, weight.context, dtype=weight.dtype),  # mean
+                mx.nd.zeros(weight.shape, weight.context, dtype=weight.dtype))  # infinity norm (u_t)
+
+    def update(self, index, weight, grad, state):
+        self._update_count(index)
+        lr = self._get_lr(index)
+        wd = self._get_wd(index)
+
+        t = self._index_update_count[index]
+        lr /= (1. - self.beta1**t)
+
+        # preprocess grad
+        grad = grad * self.rescale_grad + wd * weight
+        if self.clip_gradient is not None:
+            grad = mx.nd.clip(grad, -self.clip_gradient, self.clip_gradient)
+
+        # update m_t and u_t
+        m_t, u_t = state
+        m_t[:] = self.beta1 * m_t + (1. - self.beta1) * grad
+        u_t[:] = mx.nd.maximum(self.beta2 * u_t, mx.nd.abs(grad))
+
+        # update weight
+        weight[:] -= lr * m_t / u_t
+
+
+@with_seed()
+def test_adamax():
+    opt1 = PyAdamax
+    opt2 = mx.optimizer.Adamax
+    shape = (3, 4, 5)
+    cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}]
+    rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}]
+    wd_options = [{}, {'wd': 0.03}, {'wd': 0.05}, {'wd': 0.07}]
+    mp_options = [{}, {'multi_precision': False}, {'multi_precision': True}]
+    for dtype in [np.float16, np.float32, np.float64]:
+        for params in itertools.product(cg_options, rg_options, wd_options, mp_options):
+            kwarg = {k: v for param in params for k, v in param.items()}
+            if (dtype == np.float16 and
+                    ('multi_precision' not in kwarg or
+                    not kwarg['multi_precision'])):
+                continue
+            compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, dtype)
+
+
 # Signum
 class PySignum(mx.optimizer.Optimizer):
     """The python reference of Signum optimizer.

From 06245b16bb2e2e8caabf03841e8341ac5f7f98c8 Mon Sep 17 00:00:00 2001
From: Anirudh <anirudhkrec@gmail.com>
Date: Tue, 4 Dec 2018 11:37:50 -0800
Subject: [PATCH 25/54] Adadelta optimizer test (#13443)

* adadelta test

* comments
---
 python/mxnet/optimizer/optimizer.py     |  2 +
 tests/python/unittest/test_optimizer.py | 73 ++++++++++++++++++++++++-
 2 files changed, 74 insertions(+), 1 deletion(-)

diff --git a/python/mxnet/optimizer/optimizer.py b/python/mxnet/optimizer/optimizer.py
index 442a11d02200..d7b6821ac8cc 100644
--- a/python/mxnet/optimizer/optimizer.py
+++ b/python/mxnet/optimizer/optimizer.py
@@ -637,6 +637,8 @@ class FTML(Optimizer):
         z = beta1 * z + (1 - beta1) * rescaled_grad - (d_t - beta1 * d_(t-1)) * weight
         weight = - z / d_t
 
+    For details of the update algorithm, see :class:`~mxnet.ndarray.ftml_update`.
+
     This optimizer accepts the following parameters in addition to those accepted
     by :class:`.Optimizer`.
 
diff --git a/tests/python/unittest/test_optimizer.py b/tests/python/unittest/test_optimizer.py
index b03dcdcfba44..acf24ee1b794 100644
--- a/tests/python/unittest/test_optimizer.py
+++ b/tests/python/unittest/test_optimizer.py
@@ -15,6 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.
 
+import itertools
 import numpy as np
 import itertools
 import mxnet as mx
@@ -1050,8 +1051,8 @@ def update(self, index, weight, grad, state):
         div = grad / mx.nd.sqrt(history + self.float_stable_eps)
         weight[:] += (div + weight * wd) * -lr
 
+@with_seed()
 def test_adagrad():
-    mx.random.seed(0)
     opt1 = PyAdaGrad
     opt2 = mx.optimizer.AdaGrad
     shape = (3, 4, 5)
@@ -1076,6 +1077,76 @@ def test_adagrad():
                             compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, dtype,
                                               g_stype='row_sparse')
 
+# AdaDelta
+class PyAdaDelta(mx.optimizer.Optimizer):
+    """The python reference of AdaDelta optimizer.
+
+    This class implements AdaDelta, an optimizer described in  *ADADELTA: An adaptive
+    learning rate method*, available at https://arxiv.org/abs/1212.5701.
+
+    This optimizer updates each weight by::
+
+        grad = clip(grad * rescale_grad, clip_gradient)
+        acc_grad = rho * acc_grad + (1. - rho) * grad ** 2
+        cur_delta = sqrt(acc_delta + epsilon) / sqrt(acc_grad + epsilon) * grad
+        acc_delta = rho * acc_delta + (1. - rho) * cur_delta ** 2
+        weight -= (cur_delta + wd * weight)
+
+    This optimizer accepts the following parameters in addition to those accepted
+    by :class:`.Optimizer`.
+
+    Parameters
+    ----------
+    rho: float
+        Decay rate for both squared gradients and delta.
+    epsilon : float
+        Small value to avoid division by 0.
+    """
+    def __init__(self, rho=0.90, epsilon=1e-5, **kwargs):
+        super(PyAdaDelta, self).__init__(**kwargs)
+        self.rho = rho
+        self.epsilon = epsilon
+
+    def create_state(self, index, weight):
+        return (mx.nd.zeros(weight.shape, weight.context),
+                mx.nd.zeros(weight.shape, weight.context))
+
+    def update(self, index, weight, grad, state):
+        self._update_count(index)
+        wd = self._get_wd(index)
+
+        grad *= self.rescale_grad
+        if self.clip_gradient is not None:
+            grad = mx.nd.clip(grad, -self.clip_gradient, self.clip_gradient)
+
+        acc_grad, acc_delta = state
+
+        acc_grad[:] = self.rho * acc_grad + (1. - self.rho) * grad ** 2
+        current_delta = (mx.nd.sqrt(acc_delta + self.epsilon) /
+                         mx.nd.sqrt(acc_grad + self.epsilon)) * grad
+        acc_delta[:] = self.rho * acc_delta + (1. - self.rho) * current_delta ** 2
+
+        # update weight
+        weight[:] -= current_delta + wd * weight
+
+@with_seed()
+def test_adadelta():
+    opt1 = PyAdaDelta
+    opt2 = mx.optimizer.AdaDelta
+    shape = (3, 4, 5)
+    rho_options = [{'rho': 0.9}]
+    eps_options = [{}, {'epsilon': 1e-8}]
+    cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}]
+    rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}]
+    wd_options = [{}, {'wd': 0.0}]
+    for dtype in [np.float16, np.float32]:
+        for params in itertools.product(rho_options, eps_options, cg_options,
+                                        rg_options, wd_options):
+            kwarg = {k: v for param in params for k, v in param.items()}
+            if dtype is np.float16:
+                kwarg.update({'multi_precision': True})
+            compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, dtype)
+
 
 def test_factor_scheduler():
     base_lr = 1

From f7192b238b8816a2c7733b208f85d4a5211eb9d4 Mon Sep 17 00:00:00 2001
From: Andrew Ayres <andrew.f.ayres@gmail.com>
Date: Tue, 4 Dec 2018 16:01:26 -0800
Subject: [PATCH 26/54] Update java setup docs for 1.4.0 (#13536)

* Update java setup docs for 1.4.0

* Update Java-demo to 1.4.0
---
 docs/install/index.md                         | 18 +++++++++---------
 docs/install/java_setup.md                    |  6 +++---
 docs/tutorials/java/mxnet_java_on_intellij.md |  2 +-
 scala-package/mxnet-demo/java-demo/Makefile   |  2 +-
 scala-package/mxnet-demo/java-demo/README.md  |  4 ++--
 5 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/docs/install/index.md b/docs/install/index.md
index 5067c5df4475..6491d46be5c4 100644
--- a/docs/install/index.md
+++ b/docs/install/index.md
@@ -473,15 +473,15 @@ You can use the Maven packages defined in the following dependency to include MX
 <br/>
 You can use the Maven packages defined in the following dependency to include MXNet in your Java project. The Java API is provided as a subset of the Scala API and is intended for inference only. Please refer to the <a href="java_setup.html">MXNet-Java setup guide</a> for a detailed set of instructions to help you with the setup process.
 
-<a href="https://repository.apache.org/#nexus-search;gav~org.apache.mxnet~~1.3.1-SNAPSHOT~~"><img src="https://img.shields.io/badge/org.apache.mxnet-linux gpu-green.svg" alt="maven badge"/></a>
+<a href="https://repository.apache.org/#nexus-search;gav~org.apache.mxnet~~1.4.0-SNAPSHOT~~"><img src="https://img.shields.io/badge/org.apache.mxnet-linux gpu-green.svg" alt="maven badge"/></a>
 
 ```html
 <dependency>
     <groupId>org.apache.mxnet</groupId>
     <artifactId>mxnet-full_2.11-linux-x86_64-gpu</artifactId>
     <scope>system</scope>
-    <version>1.3.1</version>
-    <systemPath>/system/path/to/jar/mxnet-full_2.11-linux-x86_64-gpu-1.3.1-SNAPSHOT.jar</systemPath>
+    <version>1.4.0</version>
+    <systemPath>/system/path/to/jar/mxnet-full_2.11-linux-x86_64-gpu-1.4.0-SNAPSHOT.jar</systemPath>
 </dependency>
 ```
 
@@ -492,15 +492,15 @@ You can use the Maven packages defined in the following dependency to include MX
 <br/>
 You can use the Maven packages defined in the following dependency to include MXNet in your Java project. The Java API is provided as a subset of the Scala API and is intended for inference only. Please refer to the <a href="java_setup.html">MXNet-Java setup guide</a> for a detailed set of instructions to help you with the setup process.
 
-<a href="https://repository.apache.org/#nexus-search;gav~org.apache.mxnet~~1.3.1-SNAPSHOT~~"><img src="https://img.shields.io/badge/org.apache.mxnet-linux cpu-green.svg" alt="maven badge"/></a>
+<a href="https://repository.apache.org/#nexus-search;gav~org.apache.mxnet~~1.4.0-SNAPSHOT~~"><img src="https://img.shields.io/badge/org.apache.mxnet-linux cpu-green.svg" alt="maven badge"/></a>
 
 ```html
 <dependency>
     <groupId>org.apache.mxnet</groupId>
     <artifactId>mxnet-full_2.11-linux-x86_64-cpu</artifactId>
     <scope>system</scope>
-    <version>1.3.1</version>
-    <systemPath>/system/path/to/jar/mxnet-full_2.11-linux-x86_64-cpu-1.3.1-SNAPSHOT.jar</systemPath>
+    <version>1.4.0</version>
+    <systemPath>/system/path/to/jar/mxnet-full_2.11-linux-x86_64-cpu-1.4.0-SNAPSHOT.jar</systemPath>
 </dependency>
 ```
 <br>
@@ -776,15 +776,15 @@ Not available at this time. <br>
 </br>
 You can use the Maven packages defined in the following dependency to include MXNet in your Java project. The Java API is provided as a subset of the Scala API and is intended for inference only. Please refer to the <a href="java_setup.html">MXNet-Java setup guide</a> for a detailed set of instructions to help you with the setup process.
 
-<a href="https://repository.apache.org/#nexus-search;gav~org.apache.mxnet~~1.3.1-SNAPSHOT~~"><img src="https://img.shields.io/badge/org.apache.mxnet-mac cpu-green.svg" alt="maven badge"/></a>
+<a href="https://repository.apache.org/#nexus-search;gav~org.apache.mxnet~~1.4.0-SNAPSHOT~~"><img src="https://img.shields.io/badge/org.apache.mxnet-mac cpu-green.svg" alt="maven badge"/></a>
 
 ```html
 <dependency>
     <groupId>org.apache.mxnet</groupId>
     <artifactId>mxnet-full_2.11-osx-x86_64-cpu</artifactId>
     <scope>system</scope>
-    <version>1.3.1</version>
-    <systemPath>/system/path/to/jar/mxnet-full_2.11-osx-x86_64-cpu-1.3.1-SNAPSHOT.jar</systemPath>
+    <version>1.4.0</version>
+    <systemPath>/system/path/to/jar/mxnet-full_2.11-osx-x86_64-cpu-1.4.0-SNAPSHOT.jar</systemPath>
 </dependency>
 ```
 <br>
diff --git a/docs/install/java_setup.md b/docs/install/java_setup.md
index fe55d074e754..34b0967c421e 100644
--- a/docs/install/java_setup.md
+++ b/docs/install/java_setup.md
@@ -59,7 +59,7 @@ Also, add the dependency which corresponds to your platform to the `dependencies
 <dependency>
   <groupId>org.apache.mxnet</groupId>
   <artifactId>mxnet-full_2.11-linux-x86_64-cpu</artifactId>
-  <version>1.3.1-SNAPSHOT</version>
+  <version>1.4.0-SNAPSHOT</version>
 </dependency>
 ```
 
@@ -68,7 +68,7 @@ Also, add the dependency which corresponds to your platform to the `dependencies
 <dependency>
   <groupId>org.apache.mxnet</groupId>
   <artifactId>mxnet-full_2.11-linux-x86_64-gpu</artifactId>
-  <version>1.3.1-SNAPSHOT</version>
+  <version>1.4.0-SNAPSHOT</version>
 </dependency>
 ```
 
@@ -77,7 +77,7 @@ Also, add the dependency which corresponds to your platform to the `dependencies
 <dependency>
   <groupId>org.apache.mxnet</groupId>
   <artifactId>mxnet-full_2.11-osx-x86_64-cpu</artifactId>
-  <version>1.3.1-SNAPSHOT</version>
+  <version>1.4.0-SNAPSHOT</version>
 </dependency>
 ```
 
diff --git a/docs/tutorials/java/mxnet_java_on_intellij.md b/docs/tutorials/java/mxnet_java_on_intellij.md
index f4d4ea5ab839..ef2c009f66e8 100644
--- a/docs/tutorials/java/mxnet_java_on_intellij.md
+++ b/docs/tutorials/java/mxnet_java_on_intellij.md
@@ -102,7 +102,7 @@ Also, add this under the `dependencies` tag :
 <dependency>
   <groupId>org.apache.mxnet</groupId>
   <artifactId>mxnet-full_2.11-osx-x86_64-cpu</artifactId>
-  <version>1.3.1-SNAPSHOT</version>
+  <version>1.4.0-SNAPSHOT</version>
 </dependency>
 ```
 The official Java Packages will be released with the release of MXNet 1.4 and will be available on  [MXNet Maven package repository](https://search.maven.org/#search%7Cga%7C1%7Cg%3A%22org.apache.mxnet%22).
diff --git a/scala-package/mxnet-demo/java-demo/Makefile b/scala-package/mxnet-demo/java-demo/Makefile
index 6b76e272f9e0..bb47db1c6d27 100644
--- a/scala-package/mxnet-demo/java-demo/Makefile
+++ b/scala-package/mxnet-demo/java-demo/Makefile
@@ -16,7 +16,7 @@
 # under the License.
 
 SCALA_VERSION_PROFILE := 2.11
-MXNET_VERSION := 1.3.1-SNAPSHOT
+MXNET_VERSION := 1.4.0-SNAPSHOT
 
 ifeq ($(OS),Windows_NT)
 	UNAME_S := Windows
diff --git a/scala-package/mxnet-demo/java-demo/README.md b/scala-package/mxnet-demo/java-demo/README.md
index 55b4d914b834..dbe18052a899 100644
--- a/scala-package/mxnet-demo/java-demo/README.md
+++ b/scala-package/mxnet-demo/java-demo/README.md
@@ -12,7 +12,7 @@ You can use the following instruction as an alternative to achieve the same resu
 User are required to use `mvn package` to build the package,
  which are shown below:
 ```Bash
-export SCALA_VERSION_PROFILE=2.11 MXNET_VERSION=1.3.1-SNAPSHOT
+export SCALA_VERSION_PROFILE=2.11 MXNET_VERSION=1.4.0-SNAPSHOT
 export SCALA_PKG_PROFILE=
 mvn package -Dmxnet.profile=$SCALA_PKG_PROFILE \
 		-Dmxnet.scalaprofile=$SCALA_VERSION_PROFILE \
@@ -80,5 +80,5 @@ sudo apt install libopencv-imgcodecs3.4
 
 Is there any other version available?
 
-You can find nightly release version from [here](https://repository.apache.org/#nexus-search;gav~org.apache.mxnet~~1.3.1-SNAPSHOT~~).
+You can find nightly release version from [here](https://repository.apache.org/#nexus-search;gav~org.apache.mxnet~~1.4.0-SNAPSHOT~~).
 Please keep the same version in the Makefile or [above version](https://repository.apache.org/#nexus-search;gav~org.apache.mxnet~~~~) to run this demo.

From a3eca5f5c96eed0bc29bd4e58e470997091a1fb3 Mon Sep 17 00:00:00 2001
From: Alexander Zai <azai91@gmail.com>
Date: Tue, 4 Dec 2018 16:26:11 -0800
Subject: [PATCH 27/54] Revert "Feature/mkldnn static 2 (#13503)" (#13540)

This reverts commit 65edc9500b10a3404945d6d79acbae54a2833890.
---
 CMakeLists.txt                          |  1 -
 Makefile                                |  9 +---
 ci/docker/runtime_functions.sh          |  3 ++
 ci/jenkins/Jenkins_steps.groovy         |  8 ++--
 mkldnn.mk                               | 12 ++----
 tests/cpp/unittest.mk                   |  8 ++--
 tests/python/mkl/test_mkldnn.py         |  6 ++-
 tests/python/mkl/test_mkldnn_install.py | 56 +++++++++++++++++++++++++
 8 files changed, 77 insertions(+), 26 deletions(-)
 create mode 100644 tests/python/mkl/test_mkldnn_install.py

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 161705643194..3b8bbd2e0272 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -227,7 +227,6 @@ if(USE_MKLDNN)
   include(cmake/DownloadMKLML.cmake)
   # CPU architecture (e.g., C5) can't run on another architecture (e.g., g3).
   if(NOT MSVC)
-    set(MKLDNN_LIBRARY_TYPE "STATIC" CACHE INTERNAL "" FORCE)
     set(ARCH_OPT_FLAGS "-mtune=generic")
   else()
     set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /EHsc")
diff --git a/Makefile b/Makefile
index e424904ad785..16ea59f3d585 100644
--- a/Makefile
+++ b/Makefile
@@ -131,13 +131,8 @@ ifeq ($(USE_MKLDNN), 1)
 		CFLAGS += -I$(MKLROOT)/include
 		LDFLAGS += -L$(MKLROOT)/lib
 	endif
-	# MKLDNN but to needs to be dynamically linked for windows as not all VS compilers support static linking
-	ifneq ($(UNAME_S), Windows)
-		LIB_DEP += $(MKLDNNROOT)/lib/libmkldnn.a
-	else
-		CFLAGS += -I$(MKLDNNROOT)/include
-		LDFLAGS += -L$(MKLDNNROOT)/lib -lmkldnn -Wl,-rpath,'$${ORIGIN}'
-	endif
+	CFLAGS += -I$(MKLDNNROOT)/include
+	LDFLAGS += -L$(MKLDNNROOT)/lib -lmkldnn -Wl,-rpath,'$${ORIGIN}'
 endif
 
 # setup opencv
diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh
index 5a44cccc6aa0..1fc10bf0e085 100755
--- a/ci/docker/runtime_functions.sh
+++ b/ci/docker/runtime_functions.sh
@@ -629,6 +629,9 @@ build_ubuntu_gpu_cmake_mkldnn() {
         /work/mxnet
 
     ninja -v
+    # libmkldnn.so.0 is a link file. We need an actual binary file named libmkldnn.so.0.
+    cp 3rdparty/mkldnn/src/libmkldnn.so.0 3rdparty/mkldnn/src/libmkldnn.so.0.tmp
+    mv 3rdparty/mkldnn/src/libmkldnn.so.0.tmp 3rdparty/mkldnn/src/libmkldnn.so.0
 }
 
 build_ubuntu_gpu_cmake() {
diff --git a/ci/jenkins/Jenkins_steps.groovy b/ci/jenkins/Jenkins_steps.groovy
index 309775c88c85..f48a26737308 100644
--- a/ci/jenkins/Jenkins_steps.groovy
+++ b/ci/jenkins/Jenkins_steps.groovy
@@ -23,19 +23,19 @@
 utils = load('ci/Jenkinsfile_utils.groovy')
 
 // mxnet libraries
-mx_lib = 'lib/libmxnet.so, lib/libmxnet.a, lib/libiomp5.so, lib/libmklml_intel.so, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a'
+mx_lib = 'lib/libmxnet.so, lib/libmxnet.a, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a'
 
 // Python wheels
 mx_pip = 'build/*.whl'
 
 // for scala build, need to pass extra libs when run with dist_kvstore
-mx_dist_lib = 'lib/libmxnet.so, lib/libmxnet.a, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a, 3rdparty/ps-lite/build/libps.a, deps/lib/libprotobuf-lite.a, deps/lib/libzmq.a, lib/libmkldnn.a'
+mx_dist_lib = 'lib/libmxnet.so, lib/libmxnet.a, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a, 3rdparty/ps-lite/build/libps.a, deps/lib/libprotobuf-lite.a, deps/lib/libzmq.a'
 // mxnet cmake libraries, in cmake builds we do not produce a libnvvm static library by default.
 mx_cmake_lib = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/dmlc-core/libdmlc.a, build/tests/mxnet_unit_tests, build/3rdparty/openmp/runtime/src/libomp.so'
 // mxnet cmake libraries, in cmake builds we do not produce a libnvvm static library by default.
 mx_cmake_lib_debug = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/dmlc-core/libdmlc.a, build/tests/mxnet_unit_tests'
-mx_cmake_mkldnn_lib = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/dmlc-core/libdmlc.a, build/tests/mxnet_unit_tests, build/3rdparty/openmp/runtime/src/libomp.so'
-mx_mkldnn_lib = 'lib/libmxnet.so, lib/libmxnet.a, lib/libiomp5.so, lib/libmklml_intel.so, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a'
+mx_cmake_mkldnn_lib = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/dmlc-core/libdmlc.a, build/tests/mxnet_unit_tests, build/3rdparty/openmp/runtime/src/libomp.so, build/3rdparty/mkldnn/src/libmkldnn.so.0'
+mx_mkldnn_lib = 'lib/libmxnet.so, lib/libmxnet.a, lib/libiomp5.so, lib/libmkldnn.so.0, lib/libmklml_intel.so, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a'
 mx_tensorrt_lib = 'lib/libmxnet.so, lib/libnvonnxparser_runtime.so.0, lib/libnvonnxparser.so.0, lib/libonnx_proto.so, lib/libonnx.so'
 mx_lib_cpp_examples = 'lib/libmxnet.so, lib/libmxnet.a, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a, 3rdparty/ps-lite/build/libps.a, deps/lib/libprotobuf-lite.a, deps/lib/libzmq.a, build/cpp-package/example/*'
 mx_lib_cpp_examples_cpu = 'build/libmxnet.so, build/cpp-package/example/*'
diff --git a/mkldnn.mk b/mkldnn.mk
index 5af3e9b1d741..d79bbe7d2a0e 100644
--- a/mkldnn.mk
+++ b/mkldnn.mk
@@ -19,20 +19,14 @@ ifeq ($(USE_MKLDNN), 1)
 	MKLDNN_SUBMODDIR = $(ROOTDIR)/3rdparty/mkldnn
 	MKLDNN_BUILDDIR = $(MKLDNN_SUBMODDIR)/build
 	MXNET_LIBDIR = $(ROOTDIR)/lib
-	MKLDNN_LIBRARY_TYPE=STATIC
 ifeq ($(UNAME_S), Darwin)
 	OMP_LIBFILE = $(MKLDNNROOT)/lib/libiomp5.dylib
 	MKLML_LIBFILE = $(MKLDNNROOT)/lib/libmklml.dylib
-	MKLDNN_LIBFILE = $(MKLDNNROOT)/lib/libmkldnn.a
-else ifeq ($(UNAME_S), Windows)
-	OMP_LIBFILE = $(MKLDNNROOT)/lib/libiomp5.so
-	MKLML_LIBFILE = $(MKLDNNROOT)/lib/libmklml_intel.so
-	MKLDNN_LIBFILE = $(MKLDNNROOT)/lib/libmkldnn.so
-	MKLDNN_LIBRARY_TYPE=SHARED
+	MKLDNN_LIBFILE = $(MKLDNNROOT)/lib/libmkldnn.0.dylib
 else
 	OMP_LIBFILE = $(MKLDNNROOT)/lib/libiomp5.so
 	MKLML_LIBFILE = $(MKLDNNROOT)/lib/libmklml_intel.so
-	MKLDNN_LIBFILE = $(MKLDNNROOT)/lib/libmkldnn.a
+	MKLDNN_LIBFILE = $(MKLDNNROOT)/lib/libmkldnn.so.0
 endif
 endif
 
@@ -43,7 +37,7 @@ mkldnn_build: $(MKLDNN_LIBFILE)
 $(MKLDNN_LIBFILE):
 	mkdir -p $(MKLDNNROOT)
 	cd $(MKLDNN_SUBMODDIR) && rm -rf external && cd scripts && ./prepare_mkl.sh && cd .. && cp -a external/*/* $(MKLDNNROOT)/.
-	cmake $(MKLDNN_SUBMODDIR) -DCMAKE_INSTALL_PREFIX=$(MKLDNNROOT) -B$(MKLDNN_BUILDDIR) -DARCH_OPT_FLAGS="-mtune=generic" -DWITH_TEST=OFF -DWITH_EXAMPLE=OFF -DMKLDNN_LIBRARY_TYPE=$(MKLDNN_LIBRARY_TYPE)
+	cmake $(MKLDNN_SUBMODDIR) -DCMAKE_INSTALL_PREFIX=$(MKLDNNROOT) -B$(MKLDNN_BUILDDIR) -DARCH_OPT_FLAGS="-mtune=generic" -DWITH_TEST=OFF -DWITH_EXAMPLE=OFF
 	$(MAKE) -C $(MKLDNN_BUILDDIR) VERBOSE=1
 	$(MAKE) -C $(MKLDNN_BUILDDIR) install
 	mkdir -p $(MXNET_LIBDIR)
diff --git a/tests/cpp/unittest.mk b/tests/cpp/unittest.mk
index 665ce6982874..746ee2f096f1 100644
--- a/tests/cpp/unittest.mk
+++ b/tests/cpp/unittest.mk
@@ -41,22 +41,22 @@ gtest-all.o : $(GTEST_SRCS_)
 gtest.a : gtest-all.o
 	$(AR) $(ARFLAGS) $@ $^
 
-build/tests/cpp/%.o : tests/cpp/%.cc
+build/tests/cpp/%.o : tests/cpp/%.cc | mkldnn
 	@mkdir -p $(@D)
 	$(CXX) -std=c++11 $(TEST_CFLAGS) -I$(GTEST_INC) -MM -MT tests/cpp/$* $< > build/tests/cpp/$*.d
 	$(CXX) -c -std=c++11 $(TEST_CFLAGS) -I$(GTEST_INC) -o build/tests/cpp/$*.o $(filter %.cc %.a, $^)
 
-build/tests/cpp/operator/%.o : tests/cpp/operator/%.cc
+build/tests/cpp/operator/%.o : tests/cpp/operator/%.cc | mkldnn
 	@mkdir -p $(@D)
 	$(CXX) -std=c++11 $(TEST_CFLAGS) -I$(GTEST_INC) -MM -MT tests/cpp/operator/$* $< > build/tests/cpp/operator/$*.d
 	$(CXX) -c -std=c++11 $(TEST_CFLAGS) -I$(GTEST_INC) -o build/tests/cpp/operator/$*.o $(filter %.cc %.a, $^)
 
-build/tests/cpp/storage/%.o : tests/cpp/storage/%.cc
+build/tests/cpp/storage/%.o : tests/cpp/storage/%.cc | mkldnn
 	@mkdir -p $(@D)
 	$(CXX) -std=c++11 $(TEST_CFLAGS) -I$(GTEST_INC) -MM -MT tests/cpp/storage/$* $< > build/tests/cpp/storage/$*.d
 	$(CXX) -c -std=c++11 $(TEST_CFLAGS) -I$(GTEST_INC) -o build/tests/cpp/storage/$*.o $(filter %.cc %.a, $^)
 
-build/tests/cpp/engine/%.o : tests/cpp/engine/%.cc
+build/tests/cpp/engine/%.o : tests/cpp/engine/%.cc | mkldnn
 	@mkdir -p $(@D)
 	$(CXX) -std=c++11 $(TEST_CFLAGS) -I$(GTEST_INC) -MM -MT tests/cpp/engine/$* $< > build/tests/cpp/engine/$*.d
 	$(CXX) -c -std=c++11 $(TEST_CFLAGS) -I$(GTEST_INC) -o build/tests/cpp/engine/$*.o $(filter %.cc %.a, $^)
diff --git a/tests/python/mkl/test_mkldnn.py b/tests/python/mkl/test_mkldnn.py
index d9d3abfc3ced..c6c0a0832f1f 100644
--- a/tests/python/mkl/test_mkldnn.py
+++ b/tests/python/mkl/test_mkldnn.py
@@ -27,6 +27,7 @@
 from mxnet import gluon
 from mxnet.gluon import nn
 from mxnet.test_utils import *
+import test_mkldnn_install as install
 curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
 sys.path.append(os.path.join(curr_path, '../unittest/'))
 from common import with_seed
@@ -440,4 +441,7 @@ def backward(self, req, out_grad, in_data, out_data, in_grad, aux):
     custom = mx.symbol.Custom(name='custom', data=conv, op_type='custom')
     exec1 = custom.bind(mx.cpu(), args={'data': mx.nd.ones([10,3,96,96]), 'conv_weight': mx.nd.ones([8,3,5,5])})
     exec1.forward()[0].wait_to_read()
-    
+
+
+if __name__ == '__main__':
+    install.test_mkldnn_install()
diff --git a/tests/python/mkl/test_mkldnn_install.py b/tests/python/mkl/test_mkldnn_install.py
new file mode 100644
index 000000000000..c2f26df72f2e
--- /dev/null
+++ b/tests/python/mkl/test_mkldnn_install.py
@@ -0,0 +1,56 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""
+MKL-DNN related test cases
+"""
+
+import sys
+import os
+import logging
+
+
+def test_mkldnn_install():
+    """
+    This test will verify that MXNet is built/installed correctly when
+    compiled with Intel MKL-DNN library. The method will try to import
+    the mxnet module and see if the mkldnn library is mapped to this
+    process's address space.
+    """
+    logging.basicConfig(level=logging.INFO)
+
+    if not sys.platform.startswith('linux'):
+        logging.info("Bypass mkldnn install test for non-Linux OS")
+        return
+
+    try:
+        #pylint: disable=unused-variable
+        import mxnet as mx
+    except (ImportError, OSError) as e:
+        assert 0, "Import mxnet error: %s. Please double check your build/" \
+            "install steps or environment variable settings" % str(e)
+
+    pid = os.getpid()
+    rc = os.system("cat /proc/" + str(pid) +
+                   "/maps | grep libmkldnn > /dev/null")
+
+    if rc == 0:
+        logging.info("MXNet is built/installed correctly with MKL-DNN")
+    else:
+        assert 0, "MXNet is built/installed incorrectly with MKL-DNN, please " \
+            "double check your build/install steps or environment " \
+            "variable settings"

From 0f85f5de9eaa3c1b31bff9337268c071f634073d Mon Sep 17 00:00:00 2001
From: Anirudh <anirudhkrec@gmail.com>
Date: Tue, 4 Dec 2018 18:00:58 -0800
Subject: [PATCH 28/54] doc fix (#13465)

---
 python/mxnet/optimizer/optimizer.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/python/mxnet/optimizer/optimizer.py b/python/mxnet/optimizer/optimizer.py
index d7b6821ac8cc..a085b6fe2ef6 100644
--- a/python/mxnet/optimizer/optimizer.py
+++ b/python/mxnet/optimizer/optimizer.py
@@ -688,8 +688,11 @@ class LBSGD(Optimizer):
         state = momentum * state + lr * rescale_grad * clip(grad, clip_gradient) + wd * weight
         weight = weight - state
 
-    For details of the update algorithm see :class:`~mxnet.ndarray.lbsgd_update` and
-    :class:`~mxnet.ndarray.lbsgd_mom_update`.
+    For details of the update algorithm see :class:`~mxnet.ndarray.sgd_update`
+    and :class:`~mxnet.ndarray.sgd_mom_update`.
+    In addition to the SGD updates the LBSGD optimizer uses the LARS, Layer-wise
+    Adaptive Rate Scaling, algorithm to have a separate learning rate for each
+    layer of the network, which leads to better stability over large batch sizes.
 
     This optimizer accepts the following parameters in addition to those accepted
     by :class:`.Optimizer`.

From d2102faa228bdc6723a9da299c6ff5999cbbdcdb Mon Sep 17 00:00:00 2001
From: Vishaal Kapoor <40836875+vishaalkapoor@users.noreply.github.com>
Date: Tue, 4 Dec 2018 18:35:04 -0800
Subject: [PATCH 29/54] [MXAPPS-1020] Clean up some Sphinx warnings. (#13539)

---
 docs/api/python/symbol/contrib.md | 3 ---
 docs/faq/develop_and_hack.md      | 4 ++--
 docs/gluon/index.md               | 1 -
 src/operator/contrib/dgl_graph.cc | 4 +++-
 4 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/docs/api/python/symbol/contrib.md b/docs/api/python/symbol/contrib.md
index 35cd11c89a70..a0253216f945 100644
--- a/docs/api/python/symbol/contrib.md
+++ b/docs/api/python/symbol/contrib.md
@@ -55,9 +55,6 @@ In the rest of this document, we list routines provided by the `symbol.contrib`
     foreach
     while_loop
     cond
-    isinf
-    isfinite
-    isnan
     index_copy
     getnnz
     edge_id
diff --git a/docs/faq/develop_and_hack.md b/docs/faq/develop_and_hack.md
index 8b7dd672eea9..da53fd010bb0 100644
--- a/docs/faq/develop_and_hack.md
+++ b/docs/faq/develop_and_hack.md
@@ -4,5 +4,5 @@
 - [Set environment variables of MXNet](env_var.md)
 
 # Other Resources
-- [MXNet System Architecture Overview](http://mxnet.io/architecture/overview.html)
-- [Contributor Guidelines](http://mxnet.io/community/contribute.html)
\ No newline at end of file
+- [MXNet System Architecture Overview](/architecture/overview.html)
+- [Contributor Guidelines](/community/contribute.html)
diff --git a/docs/gluon/index.md b/docs/gluon/index.md
index c34ee9c22739..96e8e36dbf20 100644
--- a/docs/gluon/index.md
+++ b/docs/gluon/index.md
@@ -82,7 +82,6 @@ def forward(self, F, inputs, tree):
                         for child in tree.children]
     #Recursively builds the neural network based on each input sentence’s
     #syntactic structure during the model definition and training process
-    …
 ```
 <br/>
 **__High Performance__**
diff --git a/src/operator/contrib/dgl_graph.cc b/src/operator/contrib/dgl_graph.cc
index ed7caacfdbae..6d586755c957 100644
--- a/src/operator/contrib/dgl_graph.cc
+++ b/src/operator/contrib/dgl_graph.cc
@@ -795,7 +795,8 @@ uniform probability.
 
   out[2]
   [0 0 0 0 0]
-<NDArray 5 @cpu(0)>
+  <NDArray 5 @cpu(0)>
+
 )code" ADD_FILELINE)
 .set_attr_parser(ParamParser<NeighborSampleParam>)
 .set_num_inputs([](const NodeAttrs& attrs) {
@@ -885,6 +886,7 @@ uniform probability.
   out[3]
   [0 0 0 0 0]
   <NDArray 5 @cpu(0)>
+
 )code" ADD_FILELINE)
 .set_attr_parser(ParamParser<NeighborSampleParam>)
 .set_num_inputs([](const NodeAttrs& attrs) {

From 2f55488a446acadba02e0d8a6cb42edf47abe1d0 Mon Sep 17 00:00:00 2001
From: Lin Yuan <apeforest@gmail.com>
Date: Wed, 5 Dec 2018 09:38:25 -0800
Subject: [PATCH 30/54] [MXNET-1110] Add header files required by horovod
 (#13062)

* Add header files required by horovod

* Add symbolic link and cherry picked required header

* add python API to return include path

* update link

* fix windows CI

* fix windows build

* fix dlpack link

* merge with master

* exclude 3rd party header files from license check

* exclude license check

* exclude include directory

* remove commented lines
---
 ci/build_windows.py                                 | 3 ---
 include/dlpack                                      | 1 +
 include/dmlc                                        | 1 +
 include/mshadow                                     | 1 +
 include/nnvm                                        | 1 +
 tests/nightly/apache_rat_license_check/rat-excludes | 1 +
 6 files changed, 5 insertions(+), 3 deletions(-)
 create mode 120000 include/dlpack
 create mode 120000 include/dmlc
 create mode 120000 include/mshadow
 create mode 120000 include/nnvm

diff --git a/ci/build_windows.py b/ci/build_windows.py
index 56769f7cdaf0..b7d47fb1fde1 100755
--- a/ci/build_windows.py
+++ b/ci/build_windows.py
@@ -160,9 +160,6 @@ def windows_package(args):
         copy_tree('python', j(pkgdir, 'python'))
         logging.info('packing headers')
         copy_tree('include', j(pkgdir, 'include'))
-        copy_tree(j('3rdparty','dmlc-core','include'), j(pkgdir, 'include'))
-        copy_tree(j('3rdparty','mshadow', 'mshadow'), j(pkgdir, 'include', 'mshadow'))
-        copy_tree(j('3rdparty','tvm','nnvm', 'include'), j(pkgdir,'include', 'nnvm', 'include'))
         logging.info("Compressing package: %s", pkgfile)
         check_call(['7z', 'a', pkgfile, pkgdir])
 
diff --git a/include/dlpack b/include/dlpack
new file mode 120000
index 000000000000..e19164b88516
--- /dev/null
+++ b/include/dlpack
@@ -0,0 +1 @@
+../3rdparty/dlpack/include/dlpack
\ No newline at end of file
diff --git a/include/dmlc b/include/dmlc
new file mode 120000
index 000000000000..869c40b0e502
--- /dev/null
+++ b/include/dmlc
@@ -0,0 +1 @@
+../3rdparty/dmlc-core/include/dmlc
\ No newline at end of file
diff --git a/include/mshadow b/include/mshadow
new file mode 120000
index 000000000000..0ff1a4b9e3b4
--- /dev/null
+++ b/include/mshadow
@@ -0,0 +1 @@
+../3rdparty/mshadow/mshadow
\ No newline at end of file
diff --git a/include/nnvm b/include/nnvm
new file mode 120000
index 000000000000..779dd4459a3c
--- /dev/null
+++ b/include/nnvm
@@ -0,0 +1 @@
+../3rdparty/tvm/nnvm/include/nnvm
\ No newline at end of file
diff --git a/tests/nightly/apache_rat_license_check/rat-excludes b/tests/nightly/apache_rat_license_check/rat-excludes
index 0c305f498b34..0d95792efc15 100755
--- a/tests/nightly/apache_rat_license_check/rat-excludes
+++ b/tests/nightly/apache_rat_license_check/rat-excludes
@@ -58,3 +58,4 @@ moderngpu/*
 deformable_im2col.cuh
 deformable_im2col.h
 REQUIRE
+include/*
\ No newline at end of file

From 40db61908000ee86d21aac847ff2225807d6c168 Mon Sep 17 00:00:00 2001
From: Steffen Rochel <steffenrochel@gmail.com>
Date: Wed, 5 Dec 2018 10:11:25 -0800
Subject: [PATCH 31/54] Bumped minor version from 1.4.0 to 1.5.0 on master,
 updated License file (#13478)

* updated to v1.5.0

* Bumped minor version from 1.4.0 to 1.5.0 on master

* added Anirudh as maintainer for R package

... adding something useful and re-trigger PR check

* Updated license file for clojure, onnx-tensorrt, gtest, R-package

* Get the correct include path in pip package (#13452)

* add find_include_path API

* address reviewer comment

* change return type from list to string

* add unit test

* address reviewer comment

* address reviewer comment

* address reviewer comment

* address reviewer comment

* fix include path problem in pip package

* add comment

* fix lint error

* address reviewer comment

* address reviewer comment

* Use ~/.ccache as default ccache directory so the cache is not erased on reboot (#13431)

* Skip flaky test https://github.com/apache/incubator-mxnet/issues/13446 (#13480)

* Rewrite dataloader with process pool, improves responsiveness and reliability (#13447)

* fix recordio.py

* rewrite dataloader with pool

* fix batch as tuple

* fix prefetching

* fix pylint

* picklable function

* use pickle

* add missing commit

* Fix errors in docstrings for subgraph op; use code directive (#13463)

* [MXNET-1158] JVM Memory Management Documentation (#13105)

* update train_mnist

* Add documentation for JVM Memory Management

* update doc

* address nit picks

* address nit picks

* Grammar and clarity edits for memory management doc

* Edits for scala memory management

* Update memory-management.md

* Update memory-management.md

* Update memory-management.md

* capitalization fix

* Update row_sparse tutorial (#13414)

Update row_sparse tutorial

* Add resiliency to onnx export code (#13426)

* Added resiliency to onnx export code

- With previous infer-shape implementation, if input shape was list instead of tuple or if extra non-existent parameters were provided, the code would still work. The fixes in this commit make sure that behavior is restored to prevent any compatibility issues with existing export code.

* Fixed name of net in unittest

* Fix pylint

* [MXNET-1185] Support large array in several operators (part 1) (#13418)

* fix a few operators with large arrays (# of elements)

* fix bug in broadcast_div and add tests

* address reviewer comment

* add unit test

* add empty line

* retrigger CI

* [MXNET-1210 ] Gluon Audio - Example (#13325)

* Initialized the example

* Addressed PR comments, about existing synset.txt file - no overwrite

* RST - docstring issues fixed

* added README

* Addressed PR comments

* Addressed PR comments, checking Divide by 0

* Raising error if format is not supported.

* changed a line for ndarray of labels

* Trigger CI

* Trigger CI

* PR comments addressed around skip_header argument

* Addressed PR comments around librosa import

* PR Comments

* Passing lazy=lazy from argument

* Added PR comments, labels to README.MD

* Trigger CI

* Addressing PR Comments in README

* Modified README.md

* Added example under audio folder

* Retrigger CI

* Retrigger CI

* ONNX export: Instance normalization, Shape (#12920)

* ONNX import/export: Make backend_rep common

* ONNX export: Instance Normalization

* ONNX export: Shape operator

* Clarify dependency on OpenCV in CNN Visualization tutorial. (#13495)

* clarify ops faq regarding docs strings (#13492)

* Add graph_compact operator. (#13436)

* add graph_compact.

* fix.

* add doc.

* add tests for graph_compact.

* address comments.

* update docs.

* trigger CI

* Deprecate Jenkinsfile (#13474)

* update github location for sampled_block.py (#13508)

Updated to https://github.com/dmlc/gluon-nlp/blob/master/src/gluonnlp/model/sampled_block.py

* #13453 [Clojure] - Add Spec Validations to the Optimizer namespace (#13499)

* ONNX export: Logical operators (#12852)

* Fix cmake options parsing in dev_menu (#13458)

Add GPU+MKLDNN unittests to dev_menu

* Revert "Manually track num_max_thread (#12380)" (#13501)

This reverts commit 75410210e07a5fab5e044348aee276d578d5857e.

* Feature/mkldnn static 2 (#13503)

* build mkldnn as static lib

* update makefile to statically build mkldnn

* build static mkldnn

* fix static name

* fix static name

* update static for mac

* rename mkldnn dep in ci

* remove moving mkldnn dynamic lib

* remove commented code

* remove mkldnn dynamic for unittest

* force static for mkldnn lib

* remove dynamic mkldnn bind

* only link windows

* add mkldnn.mk

* try force linking

* remove mkldnn dynamic check

* remove test mkldnn install

* fix spacing

* fix index

* add artifacts

* add comment about windows

* remove static

* update makefile

* fix toctree Sphinx errors (#13489)

* fix toctree errors

* nudging file for CI

* Disabled flaky test test_gluon_data.test_recordimage_dataset_with_data_loader_multiworker (#13527)

* [MXNET-1234] Fix shape inference problems in Activation backward (#13409)

* Provide a failing test for ReLU activation shape inference bug

* Fix Activation backward shape inference

fixes: #13333

* Add softsign Activation to test_gluon.py

* Use activation in GPU if we are using CUDNN and not MKLDNN as it's happening right now

* Don't disable MKLDNN
---
 CMakeLists.txt                                |  1 +
 LICENSE                                       | 94 +++++++++++++++++--
 Makefile                                      |  9 +-
 R-package/DESCRIPTION                         | 10 +-
 ci/docker/runtime_functions.sh                |  3 -
 ci/jenkins/Jenkins_steps.groovy               |  8 +-
 contrib/clojure-package/README.md             | 16 ++--
 .../cnn-text-classification/project.clj       |  2 +-
 .../clojure-package/examples/gan/project.clj  |  2 +-
 .../examples/imclassification/project.clj     |  2 +-
 .../examples/module/project.clj               |  2 +-
 .../examples/multi-label/project.clj          |  2 +-
 .../examples/neural-style/project.clj         |  2 +-
 .../examples/pre-trained-models/project.clj   |  2 +-
 .../examples/profiler/project.clj             |  2 +-
 .../clojure-package/examples/rnn/project.clj  |  2 +-
 .../examples/tutorial/project.clj             |  6 +-
 .../examples/visualization/project.clj        |  2 +-
 contrib/clojure-package/project.clj           |  4 +-
 docs/api/python/symbol/contrib.md             |  3 +
 .../scala/mxnet_scala_on_intellij.md          |  4 +-
 include/mxnet/base.h                          |  2 +-
 mkldnn.mk                                     | 12 ++-
 python/mxnet/libinfo.py                       |  2 +-
 .../assembly/linux-x86_64-cpu/pom.xml         |  8 +-
 .../assembly/linux-x86_64-gpu/pom.xml         |  8 +-
 scala-package/assembly/osx-x86_64-cpu/pom.xml |  8 +-
 scala-package/assembly/pom.xml                |  2 +-
 scala-package/core/pom.xml                    |  6 +-
 scala-package/examples/pom.xml                |  6 +-
 scala-package/infer/pom.xml                   |  4 +-
 .../init-native/linux-x86_64/pom.xml          |  4 +-
 scala-package/init-native/osx-x86_64/pom.xml  |  4 +-
 scala-package/init-native/pom.xml             |  2 +-
 scala-package/init/pom.xml                    |  2 +-
 scala-package/macros/pom.xml                  |  6 +-
 scala-package/native/linux-x86_64-cpu/pom.xml |  4 +-
 scala-package/native/linux-x86_64-gpu/pom.xml |  4 +-
 scala-package/native/osx-x86_64-cpu/pom.xml   |  4 +-
 scala-package/native/pom.xml                  |  2 +-
 scala-package/pom.xml                         |  2 +-
 scala-package/spark/pom.xml                   |  4 +-
 snapcraft.yaml                                |  2 +-
 tests/cpp/unittest.mk                         |  8 +-
 .../train_mxnet_legacy_models.sh              |  4 +-
 tests/python/mkl/test_mkldnn.py               |  6 +-
 tests/python/mkl/test_mkldnn_install.py       | 56 -----------
 47 files changed, 192 insertions(+), 158 deletions(-)
 delete mode 100644 tests/python/mkl/test_mkldnn_install.py

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3b8bbd2e0272..161705643194 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -227,6 +227,7 @@ if(USE_MKLDNN)
   include(cmake/DownloadMKLML.cmake)
   # CPU architecture (e.g., C5) can't run on another architecture (e.g., g3).
   if(NOT MSVC)
+    set(MKLDNN_LIBRARY_TYPE "STATIC" CACHE INTERNAL "" FORCE)
     set(ARCH_OPT_FLAGS "-mtune=generic")
   else()
     set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /EHsc")
diff --git a/LICENSE b/LICENSE
index a8b57e583764..2eb9c329e532 100644
--- a/LICENSE
+++ b/LICENSE
@@ -218,16 +218,20 @@
     1. MXNet Cpp-package - For details, /cpp-package/LICENSE
     2. MXNet rcnn - For details, see, example/rcnn/LICENSE
     3. scala-package - For details, see, scala-package/LICENSE
-    4. Warp-CTC - For details, see, src/operator/contrib/ctc_include/LICENSE
+    4. Warp-CTC - For details, see, 3rdparty/ctc_include/LICENSE
     5. 3rdparty/dlpack - For details, see, 3rdparty/dlpack/LICENSE
     6. 3rdparty/dmlc-core - For details, see, 3rdparty/dmlc-core/LICENSE
     7. 3rdparty/mshadow - For details, see, 3rdparty/mshadow/LICENSE
     8. 3rdparty/tvm - For details, see, 3rdparty/tvm/LICENSE
     9. 3rdparty/tvm/dmlc-core - For details, see, 3rdparty/tvm/dmlc-core/LICENSE
-    10. 3rdparty/tvm/nnvm - For details, see, 3rdparty/tvm/nnvm/LICENSE
-    11. 3rdparty/ps-lite - For details, see, 3rdparty/ps-lite/LICENSE
-    12. 3rdparty/mkldnn - For details, see, 3rdparty/mkldnn/LICENSE
-    13. googlemock scripts/generator - For details, see, 3rdparty/googletest/googlemock/scripts/generator/LICENSE
+    10. 3rdparty/tvm/dlpack - For details, see, 3rdparty/tvm/3rdparty/dlpack/LICENSE
+    11. 3rdparty/tvm/nnvm - For details, see, 3rdparty/tvm/nnvm/LICENSE
+    12. 3rdparty/ps-lite - For details, see, 3rdparty/ps-lite/LICENSE
+    13. 3rdparty/mkldnn - For details, see, 3rdparty/mkldnn/LICENSE
+    14. googlemock scripts/generator - For details, see, 3rdparty/googletest/googlemock/scripts/generator/LICENSE
+    15. clojure-package - For details, see, contrib/clojure-package/LICENSE
+    16. R-package - For details, see, R-package/LICENSE
+    17. ONNX-TensorRT benchmark package - For details, see, 3rdparty/onnx-tensorrt/third_party/onnx/third_party/benchmark/LICENSE
 
 
     =======================================================================================
@@ -239,6 +243,9 @@
     3. tree_lstm - For details, see example/gluon/tree_lstm/LICENSE
     4. OpenMP - For details, see 3rdparty/openmp/LICENSE.txt
     5. HalideIR - For details, see nnvm/tvm/HalideIR/LICENSE
+    6. HalideIR - For details, see 3rdparty/tvm/3rdparty/HalideIR/LICENSE
+    7. ONNX-TensorRT - For details, see 3rdparty/onnx-tensorrt/LICENSE
+    8. ONNX-TensorRT - For details, see 3rdparty/onnx-tensorrt/third_party/onnx/LICENSE
 
 
     =======================================================================================
@@ -246,7 +253,7 @@
     =======================================================================================
 
     1. Moderngpu
-    For details, see, src/operator/contrib/ctc_include/contrib/moderngpu/LICENSE
+    For details, see, 3rdparty/ctc_include/contrib/moderngpu/LICENSE
 
     /******************************************************************************
     * Redistribution and use in source and binary forms, with or without
@@ -559,4 +566,79 @@
     #  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     #  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+    =======================================================================================
+
+    12. Google tests
+        For details, see, 3rdparty/mkldnn/tests/gtests/gtest/LICENSE
+
+    Copyright 2008, Google Inc.
+    All rights reserved.
+
+    Redistribution and use in source and binary forms, with or without
+    modification, are permitted provided that the following conditions are
+    met:
+
+        * Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+        * Redistributions in binary form must reproduce the above
+    copyright notice, this list of conditions and the following disclaimer
+    in the documentation and/or other materials provided with the
+    distribution.
+        * Neither the name of Google Inc. nor the names of its
+    contributors may be used to endorse or promote products derived from
+    this software without specific prior written permission.
+
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+    OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    =======================================================================================
+
+    13. ONNX python bindings
+    For details, see, 3rdparty/onnx-tensorrt/third_party/onnx/third_party/pybind11/LICENSE
+
+    Copyright (c) 2016 Wenzel Jakob <wenzel.jakob@epfl.ch>, All rights reserved.
+
+    Redistribution and use in source and binary forms, with or without
+    modification, are permitted provided that the following conditions are met:
+
+    1. Redistributions of source code must retain the above copyright notice, this
+       list of conditions and the following disclaimer.
+
+    2. Redistributions in binary form must reproduce the above copyright notice,
+       this list of conditions and the following disclaimer in the documentation
+       and/or other materials provided with the distribution.
+
+    3. Neither the name of the copyright holder nor the names of its contributors
+       may be used to endorse or promote products derived from this software
+       without specific prior written permission.
+
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+    ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+    WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+    DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+    FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+    DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+    SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+    CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+    OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    You are under no obligation whatsoever to provide any bug fixes, patches, or
+    upgrades to the features, functionality or performance of the source code
+    ("Enhancements") to anyone; however, if you choose to make your Enhancements
+    available either publicly, or directly to the author of this software, without
+    imposing a separate written license agreement for such Enhancements, then you
+    hereby grant the following license: a non-exclusive, royalty-free perpetual
+    license to install, use, modify, prepare derivative works, incorporate into
+    other computer software, distribute, and sublicense such enhancements or
+    derivative works thereof, in binary and source code form.
 
diff --git a/Makefile b/Makefile
index 16ea59f3d585..e424904ad785 100644
--- a/Makefile
+++ b/Makefile
@@ -131,8 +131,13 @@ ifeq ($(USE_MKLDNN), 1)
 		CFLAGS += -I$(MKLROOT)/include
 		LDFLAGS += -L$(MKLROOT)/lib
 	endif
-	CFLAGS += -I$(MKLDNNROOT)/include
-	LDFLAGS += -L$(MKLDNNROOT)/lib -lmkldnn -Wl,-rpath,'$${ORIGIN}'
+	# MKLDNN needs to be dynamically linked on Windows, as not all VS compilers support static linking
+	ifneq ($(UNAME_S), Windows)
+		LIB_DEP += $(MKLDNNROOT)/lib/libmkldnn.a
+	else
+		CFLAGS += -I$(MKLDNNROOT)/include
+		LDFLAGS += -L$(MKLDNNROOT)/lib -lmkldnn -Wl,-rpath,'$${ORIGIN}'
+	endif
 endif
 
 # setup opencv
diff --git a/R-package/DESCRIPTION b/R-package/DESCRIPTION
index 46702eff9ed7..da098996c68b 100644
--- a/R-package/DESCRIPTION
+++ b/R-package/DESCRIPTION
@@ -1,17 +1,17 @@
 Package: mxnet
 Type: Package
 Title: MXNet: A Flexible and Efficient Machine Learning Library for Heterogeneous Distributed Systems
-Version: 1.4.0
+Version: 1.5.0
 Date: 2017-06-27
 Author: Tianqi Chen, Qiang Kou, Tong He
-Maintainer: Qiang Kou <qkou@qkou.info>
-Repository: DMLC
+Maintainer: Qiang Kou <qkou@qkou.info>, anirudhacharya <https://github.com/anirudhacharya>
+Repository: Apache
 Description: MXNet is a deep learning framework designed for both efficiency
     and flexibility. It allows you to mix the flavours of deep learning programs
     together to maximize the efficiency and your productivity.
 License: Apache License (== 2.0)
-URL: https://github.com/dmlc/mxnet/tree/master/R-package
-BugReports: https://github.com/dmlc/mxnet/issues
+URL: https://github.com/apache/incubator-mxnet/tree/master/R-package
+BugReports: https://github.com/apache/incubator-mxnet/issues
 Imports:
     methods,
     Rcpp (>= 0.12.1),
diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh
index 1fc10bf0e085..5a44cccc6aa0 100755
--- a/ci/docker/runtime_functions.sh
+++ b/ci/docker/runtime_functions.sh
@@ -629,9 +629,6 @@ build_ubuntu_gpu_cmake_mkldnn() {
         /work/mxnet
 
     ninja -v
-    # libmkldnn.so.0 is a link file. We need an actual binary file named libmkldnn.so.0.
-    cp 3rdparty/mkldnn/src/libmkldnn.so.0 3rdparty/mkldnn/src/libmkldnn.so.0.tmp
-    mv 3rdparty/mkldnn/src/libmkldnn.so.0.tmp 3rdparty/mkldnn/src/libmkldnn.so.0
 }
 
 build_ubuntu_gpu_cmake() {
diff --git a/ci/jenkins/Jenkins_steps.groovy b/ci/jenkins/Jenkins_steps.groovy
index f48a26737308..309775c88c85 100644
--- a/ci/jenkins/Jenkins_steps.groovy
+++ b/ci/jenkins/Jenkins_steps.groovy
@@ -23,19 +23,19 @@
 utils = load('ci/Jenkinsfile_utils.groovy')
 
 // mxnet libraries
-mx_lib = 'lib/libmxnet.so, lib/libmxnet.a, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a'
+mx_lib = 'lib/libmxnet.so, lib/libmxnet.a, lib/libiomp5.so, lib/libmklml_intel.so, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a'
 
 // Python wheels
 mx_pip = 'build/*.whl'
 
 // for scala build, need to pass extra libs when run with dist_kvstore
-mx_dist_lib = 'lib/libmxnet.so, lib/libmxnet.a, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a, 3rdparty/ps-lite/build/libps.a, deps/lib/libprotobuf-lite.a, deps/lib/libzmq.a'
+mx_dist_lib = 'lib/libmxnet.so, lib/libmxnet.a, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a, 3rdparty/ps-lite/build/libps.a, deps/lib/libprotobuf-lite.a, deps/lib/libzmq.a, lib/libmkldnn.a'
 // mxnet cmake libraries, in cmake builds we do not produce a libnvvm static library by default.
 mx_cmake_lib = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/dmlc-core/libdmlc.a, build/tests/mxnet_unit_tests, build/3rdparty/openmp/runtime/src/libomp.so'
 // mxnet cmake libraries, in cmake builds we do not produce a libnvvm static library by default.
 mx_cmake_lib_debug = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/dmlc-core/libdmlc.a, build/tests/mxnet_unit_tests'
-mx_cmake_mkldnn_lib = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/dmlc-core/libdmlc.a, build/tests/mxnet_unit_tests, build/3rdparty/openmp/runtime/src/libomp.so, build/3rdparty/mkldnn/src/libmkldnn.so.0'
-mx_mkldnn_lib = 'lib/libmxnet.so, lib/libmxnet.a, lib/libiomp5.so, lib/libmkldnn.so.0, lib/libmklml_intel.so, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a'
+mx_cmake_mkldnn_lib = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/dmlc-core/libdmlc.a, build/tests/mxnet_unit_tests, build/3rdparty/openmp/runtime/src/libomp.so'
+mx_mkldnn_lib = 'lib/libmxnet.so, lib/libmxnet.a, lib/libiomp5.so, lib/libmklml_intel.so, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a'
 mx_tensorrt_lib = 'lib/libmxnet.so, lib/libnvonnxparser_runtime.so.0, lib/libnvonnxparser.so.0, lib/libonnx_proto.so, lib/libonnx.so'
 mx_lib_cpp_examples = 'lib/libmxnet.so, lib/libmxnet.a, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a, 3rdparty/ps-lite/build/libps.a, deps/lib/libprotobuf-lite.a, deps/lib/libzmq.a, build/cpp-package/example/*'
 mx_lib_cpp_examples_cpu = 'build/libmxnet.so, build/cpp-package/example/*'
diff --git a/contrib/clojure-package/README.md b/contrib/clojure-package/README.md
index bc6100b86123..10b3ed770582 100644
--- a/contrib/clojure-package/README.md
+++ b/contrib/clojure-package/README.md
@@ -105,9 +105,9 @@ brew install opencv
 - Create a new project with `lein new my-mxnet`
 - Edit your `project.clj` and add one of the following entries to `:dependencies`, based on your system and the compute device you want to use:
 
-  - `[org.apache.mxnet.contrib.clojure/clojure-mxnet-linux-cpu "1.4.0"]`
-  - `[org.apache.mxnet.contrib.clojure/clojure-mxnet-linux-gpu "1.4.0"]`
-  - `[org.apache.mxnet.contrib.clojure/clojure-mxnet-osx-cpu "1.4.0"]`
+  - `[org.apache.mxnet.contrib.clojure/clojure-mxnet-linux-cpu "1.5.0"]`
+  - `[org.apache.mxnet.contrib.clojure/clojure-mxnet-linux-gpu "1.5.0"]`
+  - `[org.apache.mxnet.contrib.clojure/clojure-mxnet-osx-cpu "1.5.0"]`
 
 After making this change and running `lein deps`, you should be able to run example code like this [NDArray Tutorial](https://github.com/apache/incubator-mxnet/blob/master/contrib/clojure-package/examples/tutorial/src/tutorial/ndarray.clj).
 
@@ -116,20 +116,20 @@ After making this change and running `lein deps`, you should be able to run exam
 With this option, you will install a Git revision of the Clojure package source and a [Scala package jar from Maven](https://search.maven.org/search?q=g:org.apache.mxnet) with native dependencies baked in.
 
 - Install additional dependencies as described in [the corresponding section for Option 1](#installing-additional-dependencies),
-- Recursively clone the MXNet repository and checkout the desired revision. Here we assume the `1.4.0` tag and a clone into the `~/mxnet` directory:
+- Recursively clone the MXNet repository and check out the desired revision. Here we assume the `1.5.0` tag and a clone into the `~/mxnet` directory:
 
   ```bash
   git clone --recursive https://github.com/apache/incubator-mxnet.git ~/mxnet
   cd ~/mxnet
   git tag --list  # Find the tag that matches the Scala package version
-  git checkout tags/1.4.0 -b my_mxnet
+  git checkout tags/1.5.0 -b my_mxnet
   git submodule update --init --recursive
   cd contrib/clojure
   ```
 
 - Edit `project.clj` to include the desired Scala jar from Maven:
 
-      [org.apache.mxnet/mxnet-full_2.11-linux-x86_64-cpu "1.4.0”]
+      [org.apache.mxnet/mxnet-full_2.11-linux-x86_64-cpu "1.5.0"]
 
 - Run `lein test`. All the tests should run without error.
 - At this point you can run `lein install` to build and install the Clojure jar locally.
@@ -147,7 +147,7 @@ The first step is to recursively clone the MXNet repository and checkout the des
   ```bash
   git clone --recursive https://github.com/apache/incubator-mxnet.git ~/mxnet
   cd ~/mxnet
-  git checkout tags/1.4.0 -b my_mxnet  # this is optional
+  git checkout tags/1.5.0 -b my_mxnet  # this is optional
   git submodule update --init --recursive
   ```
 
@@ -176,7 +176,7 @@ The outcome of this step will be a shared library `lib/libmxnet.so` that is used
 
 #### Building the Clojure jar
  
-- Enter the `contrib/clojure` directory and edit the `project.clj` file. Add the Scala jar that was just created and installed, e.g., `[org.apache.mxnet/mxnet-full_2.11-osx-x86_64-cpu "1.4.0-SNAPSHOT"]`, to the `:dependencies`.
+- Enter the `contrib/clojure` directory and edit the `project.clj` file. Add the Scala jar that was just created and installed, e.g., `[org.apache.mxnet/mxnet-full_2.11-osx-x86_64-cpu "1.5.0-SNAPSHOT"]`, to the `:dependencies`.
 - Run `lein test`. All the tests should run without an error.
 - Run `lein install` to build and install the Clojure jar locally.
 
diff --git a/contrib/clojure-package/examples/cnn-text-classification/project.clj b/contrib/clojure-package/examples/cnn-text-classification/project.clj
index 3eed0ddf9d9c..29ebefe5d200 100644
--- a/contrib/clojure-package/examples/cnn-text-classification/project.clj
+++ b/contrib/clojure-package/examples/cnn-text-classification/project.clj
@@ -19,6 +19,6 @@
   :description "CNN text classification with MXNet"
   :plugins [[lein-cljfmt "0.5.7"]]
   :dependencies [[org.clojure/clojure "1.9.0"]
-                 [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.4.0-SNAPSHOT"]]
+                 [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.5.0-SNAPSHOT"]]
   :pedantic? :skip
   :main cnn-text-classification.classifier)
diff --git a/contrib/clojure-package/examples/gan/project.clj b/contrib/clojure-package/examples/gan/project.clj
index 36b7c6cb3089..b8f6903cabba 100644
--- a/contrib/clojure-package/examples/gan/project.clj
+++ b/contrib/clojure-package/examples/gan/project.clj
@@ -19,6 +19,6 @@
   :description "GAN MNIST with MXNet"
   :plugins [[lein-cljfmt "0.5.7"]]
   :dependencies [[org.clojure/clojure "1.9.0"]
-                 [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.4.0-SNAPSHOT"]
+                 [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.5.0-SNAPSHOT"]
                  [nu.pattern/opencv "2.4.9-7"]]
   :main gan.gan-mnist)
diff --git a/contrib/clojure-package/examples/imclassification/project.clj b/contrib/clojure-package/examples/imclassification/project.clj
index 0dbede5052ac..5f77cf55cf35 100644
--- a/contrib/clojure-package/examples/imclassification/project.clj
+++ b/contrib/clojure-package/examples/imclassification/project.clj
@@ -19,6 +19,6 @@
   :description "Clojure examples for image classification"
   :plugins [[lein-cljfmt "0.5.7"]]
   :dependencies [[org.clojure/clojure "1.9.0"]
-                 [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.4.0-SNAPSHOT"]]
+                 [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.5.0-SNAPSHOT"]]
   :pedantic? :skip
   :main imclassification.train-mnist)
diff --git a/contrib/clojure-package/examples/module/project.clj b/contrib/clojure-package/examples/module/project.clj
index a9a0a5f23e6e..b667a2a4e122 100644
--- a/contrib/clojure-package/examples/module/project.clj
+++ b/contrib/clojure-package/examples/module/project.clj
@@ -19,7 +19,7 @@
   :description "Clojure examples for module"
   :plugins [[lein-cljfmt "0.5.7"]]
   :dependencies [[org.clojure/clojure "1.9.0"]
-                 [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.4.0-SNAPSHOT"]]
+                 [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.5.0-SNAPSHOT"]]
   :pedantic? :skip
   :main mnist-mlp)
 
diff --git a/contrib/clojure-package/examples/multi-label/project.clj b/contrib/clojure-package/examples/multi-label/project.clj
index 8923738b946d..6e6a14340d36 100644
--- a/contrib/clojure-package/examples/multi-label/project.clj
+++ b/contrib/clojure-package/examples/multi-label/project.clj
@@ -19,5 +19,5 @@
   :description "Example of multi-label classification"
   :plugins [[lein-cljfmt "0.5.7"]]
   :dependencies [[org.clojure/clojure "1.9.0"]
-                 [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.4.0-SNAPSHOT"]]
+                 [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.5.0-SNAPSHOT"]]
   :main multi-label.core)
diff --git a/contrib/clojure-package/examples/neural-style/project.clj b/contrib/clojure-package/examples/neural-style/project.clj
index 5a8eebea783f..b6d29f7c0e87 100644
--- a/contrib/clojure-package/examples/neural-style/project.clj
+++ b/contrib/clojure-package/examples/neural-style/project.clj
@@ -19,7 +19,7 @@
   :description "Neural Style Transfer with MXNet"
   :plugins [[lein-cljfmt "0.5.7"]]
   :dependencies [[org.clojure/clojure "1.9.0"]
-                 [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.4.0-SNAPSHOT"]
+                 [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.5.0-SNAPSHOT"]
                  [net.mikera/imagez "0.12.0"]
                  [thinktopic/think.image "0.4.16"]]
   :main neural-style.core)
diff --git a/contrib/clojure-package/examples/pre-trained-models/project.clj b/contrib/clojure-package/examples/pre-trained-models/project.clj
index 58b591ce5307..11e002503464 100644
--- a/contrib/clojure-package/examples/pre-trained-models/project.clj
+++ b/contrib/clojure-package/examples/pre-trained-models/project.clj
@@ -19,7 +19,7 @@
   :description "Example of using pre-trained models with MXNet"
   :plugins [[lein-cljfmt "0.5.7"]]
   :dependencies [[org.clojure/clojure "1.9.0"]
-                 [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.4.0-SNAPSHOT"]
+                 [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.5.0-SNAPSHOT"]
                  [net.mikera/imagez "0.12.0"]
                  [thinktopic/think.image "0.4.16"]]
   :main pre-trained-models.fine-tune)
diff --git a/contrib/clojure-package/examples/profiler/project.clj b/contrib/clojure-package/examples/profiler/project.clj
index fa30eafa0daf..cc50482d0418 100644
--- a/contrib/clojure-package/examples/profiler/project.clj
+++ b/contrib/clojure-package/examples/profiler/project.clj
@@ -18,5 +18,5 @@
 (defproject profiler "0.1.0-SNAPSHOT"
   :plugins [[lein-cljfmt "0.5.7"]]
   :dependencies [[org.clojure/clojure "1.9.0"]
-                 [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.4.0-SNAPSHOT"]]
+                 [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.5.0-SNAPSHOT"]]
   :main profiler.core)
diff --git a/contrib/clojure-package/examples/rnn/project.clj b/contrib/clojure-package/examples/rnn/project.clj
index 291f2bd46e3a..64f4c290741c 100644
--- a/contrib/clojure-package/examples/rnn/project.clj
+++ b/contrib/clojure-package/examples/rnn/project.clj
@@ -19,5 +19,5 @@
   :description "RNN example"
   :plugins [[lein-cljfmt "0.5.7"]]
   :dependencies [[org.clojure/clojure "1.9.0"]
-                 [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.4.0-SNAPSHOT"]]
+                 [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.5.0-SNAPSHOT"]]
   :main rnn.train-char-rnn)
diff --git a/contrib/clojure-package/examples/tutorial/project.clj b/contrib/clojure-package/examples/tutorial/project.clj
index 8a78ec6a6abf..9c4f1b96f9e0 100644
--- a/contrib/clojure-package/examples/tutorial/project.clj
+++ b/contrib/clojure-package/examples/tutorial/project.clj
@@ -20,6 +20,6 @@
   :plugins [[lein-cljfmt "0.5.7"]]
   :dependencies [[org.clojure/clojure "1.9.0"]
                  ;; Uncomment the one appropriate for your machine & configuration:
-                 #_[org.apache.mxnet.contrib.clojure/clojure-mxnet-linux-cpu "1.4.0"]
-                 #_[org.apache.mxnet.contrib.clojure/clojure-mxnet-linux-gpu "1.4.0"]
-                 #_[org.apache.mxnet.contrib.clojure/clojure-mxnet-osx-cpu "1.4.0"]])
+                 #_[org.apache.mxnet.contrib.clojure/clojure-mxnet-linux-cpu "1.5.0"]
+                 #_[org.apache.mxnet.contrib.clojure/clojure-mxnet-linux-gpu "1.5.0"]
+                 #_[org.apache.mxnet.contrib.clojure/clojure-mxnet-osx-cpu "1.5.0"]])
diff --git a/contrib/clojure-package/examples/visualization/project.clj b/contrib/clojure-package/examples/visualization/project.clj
index d56ddfb23f0c..d91ace3188e6 100644
--- a/contrib/clojure-package/examples/visualization/project.clj
+++ b/contrib/clojure-package/examples/visualization/project.clj
@@ -19,5 +19,5 @@
   :description "Visualization example"
   :plugins [[lein-cljfmt "0.5.7"]]
   :dependencies [[org.clojure/clojure "1.9.0"]
-                 [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.4.0-SNAPSHOT"]]
+                 [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.5.0-SNAPSHOT"]]
   :main visualization.core)
diff --git a/contrib/clojure-package/project.clj b/contrib/clojure-package/project.clj
index ae7ccd67fd9c..12a0504e02d5 100644
--- a/contrib/clojure-package/project.clj
+++ b/contrib/clojure-package/project.clj
@@ -15,7 +15,7 @@
 ;; limitations under the License.
 ;;
 
-(defproject org.apache.mxnet.contrib.clojure/clojure-mxnet "1.4.0-SNAPSHOT"
+(defproject org.apache.mxnet.contrib.clojure/clojure-mxnet "1.5.0-SNAPSHOT"
   :description "Clojure package for MXNet"
   :url "https://github.com/apache/incubator-mxnet"
   :license {:name "Apache License"
@@ -29,7 +29,7 @@
                  ;[org.apache.mxnet/mxnet-full_2.11-linux-x86_64-gpu "1.2.1"]
 
                  ;;; CI
-                 [org.apache.mxnet/mxnet-full_2.11-linux-x86_64-cpu "1.4.0-SNAPSHOT"]
+                 [org.apache.mxnet/mxnet-full_2.11-linux-x86_64-cpu "1.5.0-SNAPSHOT"]
 
                  [org.clojure/tools.logging "0.4.0"]
                  [org.apache.logging.log4j/log4j-core "2.8.1"]
diff --git a/docs/api/python/symbol/contrib.md b/docs/api/python/symbol/contrib.md
index a0253216f945..35cd11c89a70 100644
--- a/docs/api/python/symbol/contrib.md
+++ b/docs/api/python/symbol/contrib.md
@@ -55,6 +55,9 @@ In the rest of this document, we list routines provided by the `symbol.contrib`
     foreach
     while_loop
     cond
+    isinf
+    isfinite
+    isnan
     index_copy
     getnnz
     edge_id
diff --git a/docs/tutorials/scala/mxnet_scala_on_intellij.md b/docs/tutorials/scala/mxnet_scala_on_intellij.md
index 174e3018098b..a0bf24e34e28 100644
--- a/docs/tutorials/scala/mxnet_scala_on_intellij.md
+++ b/docs/tutorials/scala/mxnet_scala_on_intellij.md
@@ -385,14 +385,14 @@ If you chose to "Build from Source" when following the [install instructions](ht
       <groupId>org.apache.mxnet</groupId>
       <artifactId>mxnet-core_${scala.version}-${platform}-sources</artifactId>
       <scope>system</scope>
-      <systemPath>/PathToMXNetSource/incubator-mxnet/scala-package/assembly/osx-x86_64-cpu/target/mxnet-full_${scala.version}-osx-x86_64-cpu-1.4.0-SNAPSHOT-sources.jar</systemPath>
+      <systemPath>/PathToMXNetSource/incubator-mxnet/scala-package/assembly/osx-x86_64-cpu/target/mxnet-full_${scala.version}-osx-x86_64-cpu-1.5.0-SNAPSHOT-sources.jar</systemPath>
     </dependency>
 
     <dependency>
       <groupId>org.apache.mxnet</groupId>
       <artifactId>mxnet-full_${scala.version}-${platform}</artifactId>
       <scope>system</scope>
-      <systemPath>/PathToMXNetSource/incubator-mxnet/scala-package/assembly/osx-x86_64-cpu/target/mxnet-full_${scala.version}-osx-x86_64-cpu-1.4.0-SNAPSHOT.jar</systemPath>
+      <systemPath>/PathToMXNetSource/incubator-mxnet/scala-package/assembly/osx-x86_64-cpu/target/mxnet-full_${scala.version}-osx-x86_64-cpu-1.5.0-SNAPSHOT.jar</systemPath>
     </dependency>
 ```
 
diff --git a/include/mxnet/base.h b/include/mxnet/base.h
index f773139d6c3e..92d9c2699d63 100644
--- a/include/mxnet/base.h
+++ b/include/mxnet/base.h
@@ -102,7 +102,7 @@
 /*! \brief major version */
 #define MXNET_MAJOR 1
 /*! \brief minor version */
-#define MXNET_MINOR 4
+#define MXNET_MINOR 5
 /*! \brief patch version */
 #define MXNET_PATCH 0
 /*! \brief mxnet version */
diff --git a/mkldnn.mk b/mkldnn.mk
index d79bbe7d2a0e..5af3e9b1d741 100644
--- a/mkldnn.mk
+++ b/mkldnn.mk
@@ -19,14 +19,20 @@ ifeq ($(USE_MKLDNN), 1)
 	MKLDNN_SUBMODDIR = $(ROOTDIR)/3rdparty/mkldnn
 	MKLDNN_BUILDDIR = $(MKLDNN_SUBMODDIR)/build
 	MXNET_LIBDIR = $(ROOTDIR)/lib
+	MKLDNN_LIBRARY_TYPE=STATIC
 ifeq ($(UNAME_S), Darwin)
 	OMP_LIBFILE = $(MKLDNNROOT)/lib/libiomp5.dylib
 	MKLML_LIBFILE = $(MKLDNNROOT)/lib/libmklml.dylib
-	MKLDNN_LIBFILE = $(MKLDNNROOT)/lib/libmkldnn.0.dylib
+	MKLDNN_LIBFILE = $(MKLDNNROOT)/lib/libmkldnn.a
+else ifeq ($(UNAME_S), Windows)
+	OMP_LIBFILE = $(MKLDNNROOT)/lib/libiomp5.so
+	MKLML_LIBFILE = $(MKLDNNROOT)/lib/libmklml_intel.so
+	MKLDNN_LIBFILE = $(MKLDNNROOT)/lib/libmkldnn.so
+	MKLDNN_LIBRARY_TYPE=SHARED
 else
 	OMP_LIBFILE = $(MKLDNNROOT)/lib/libiomp5.so
 	MKLML_LIBFILE = $(MKLDNNROOT)/lib/libmklml_intel.so
-	MKLDNN_LIBFILE = $(MKLDNNROOT)/lib/libmkldnn.so.0
+	MKLDNN_LIBFILE = $(MKLDNNROOT)/lib/libmkldnn.a
 endif
 endif
 
@@ -37,7 +43,7 @@ mkldnn_build: $(MKLDNN_LIBFILE)
 $(MKLDNN_LIBFILE):
 	mkdir -p $(MKLDNNROOT)
 	cd $(MKLDNN_SUBMODDIR) && rm -rf external && cd scripts && ./prepare_mkl.sh && cd .. && cp -a external/*/* $(MKLDNNROOT)/.
-	cmake $(MKLDNN_SUBMODDIR) -DCMAKE_INSTALL_PREFIX=$(MKLDNNROOT) -B$(MKLDNN_BUILDDIR) -DARCH_OPT_FLAGS="-mtune=generic" -DWITH_TEST=OFF -DWITH_EXAMPLE=OFF
+	cmake $(MKLDNN_SUBMODDIR) -DCMAKE_INSTALL_PREFIX=$(MKLDNNROOT) -B$(MKLDNN_BUILDDIR) -DARCH_OPT_FLAGS="-mtune=generic" -DWITH_TEST=OFF -DWITH_EXAMPLE=OFF -DMKLDNN_LIBRARY_TYPE=$(MKLDNN_LIBRARY_TYPE)
 	$(MAKE) -C $(MKLDNN_BUILDDIR) VERBOSE=1
 	$(MAKE) -C $(MKLDNN_BUILDDIR) install
 	mkdir -p $(MXNET_LIBDIR)
diff --git a/python/mxnet/libinfo.py b/python/mxnet/libinfo.py
index 57c73e5943af..ff795f914a4b 100644
--- a/python/mxnet/libinfo.py
+++ b/python/mxnet/libinfo.py
@@ -111,4 +111,4 @@ def find_include_path():
 
 
 # current version
-__version__ = "1.4.0"
+__version__ = "1.5.0"
diff --git a/scala-package/assembly/linux-x86_64-cpu/pom.xml b/scala-package/assembly/linux-x86_64-cpu/pom.xml
index fbc0ab027ac7..abefead175c7 100644
--- a/scala-package/assembly/linux-x86_64-cpu/pom.xml
+++ b/scala-package/assembly/linux-x86_64-cpu/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.apache.mxnet</groupId>
     <artifactId>mxnet-full-parent_2.11</artifactId>
-    <version>1.4.0-SNAPSHOT</version>
+    <version>1.5.0-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 
@@ -18,18 +18,18 @@
     <dependency>
       <groupId>org.apache.mxnet</groupId>
       <artifactId>mxnet-core_${scala.binary.version}</artifactId>
-      <version>1.4.0-SNAPSHOT</version>
+      <version>1.5.0-SNAPSHOT</version>
     </dependency>
     <dependency>
       <groupId>org.apache.mxnet</groupId>
       <artifactId>libmxnet-scala-linux-x86_64-cpu</artifactId>
-      <version>1.4.0-SNAPSHOT</version>
+      <version>1.5.0-SNAPSHOT</version>
       <type>so</type>
     </dependency>
     <dependency>
       <groupId>org.apache.mxnet</groupId>
       <artifactId>mxnet-infer_${scala.binary.version}</artifactId>
-      <version>1.4.0-SNAPSHOT</version>
+      <version>1.5.0-SNAPSHOT</version>
     </dependency>
   </dependencies>
 
diff --git a/scala-package/assembly/linux-x86_64-gpu/pom.xml b/scala-package/assembly/linux-x86_64-gpu/pom.xml
index a1a94808e918..96ffa38c6af2 100644
--- a/scala-package/assembly/linux-x86_64-gpu/pom.xml
+++ b/scala-package/assembly/linux-x86_64-gpu/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.apache.mxnet</groupId>
     <artifactId>mxnet-full-parent_2.11</artifactId>
-    <version>1.4.0-SNAPSHOT</version>
+    <version>1.5.0-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 
@@ -18,18 +18,18 @@
     <dependency>
       <groupId>org.apache.mxnet</groupId>
       <artifactId>mxnet-core_${scala.binary.version}</artifactId>
-      <version>1.4.0-SNAPSHOT</version>
+      <version>1.5.0-SNAPSHOT</version>
     </dependency>
     <dependency>
       <groupId>org.apache.mxnet</groupId>
       <artifactId>libmxnet-scala-linux-x86_64-gpu</artifactId>
-      <version>1.4.0-SNAPSHOT</version>
+      <version>1.5.0-SNAPSHOT</version>
       <type>so</type>
     </dependency>
     <dependency>
       <groupId>org.apache.mxnet</groupId>
       <artifactId>mxnet-infer_${scala.binary.version}</artifactId>
-      <version>1.4.0-SNAPSHOT</version>
+      <version>1.5.0-SNAPSHOT</version>
     </dependency>
   </dependencies>
 
diff --git a/scala-package/assembly/osx-x86_64-cpu/pom.xml b/scala-package/assembly/osx-x86_64-cpu/pom.xml
index bb6af0353762..5c5733a9a4ce 100644
--- a/scala-package/assembly/osx-x86_64-cpu/pom.xml
+++ b/scala-package/assembly/osx-x86_64-cpu/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.apache.mxnet</groupId>
     <artifactId>mxnet-full-parent_2.11</artifactId>
-    <version>1.4.0-SNAPSHOT</version>
+    <version>1.5.0-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 
@@ -18,18 +18,18 @@
     <dependency>
       <groupId>org.apache.mxnet</groupId>
       <artifactId>mxnet-core_${scala.binary.version}</artifactId>
-      <version>1.4.0-SNAPSHOT</version>
+      <version>1.5.0-SNAPSHOT</version>
     </dependency>
     <dependency>
       <groupId>org.apache.mxnet</groupId>
       <artifactId>libmxnet-scala-osx-x86_64-cpu</artifactId>
-      <version>1.4.0-SNAPSHOT</version>
+      <version>1.5.0-SNAPSHOT</version>
       <type>jnilib</type>
     </dependency>
     <dependency>
       <groupId>org.apache.mxnet</groupId>
       <artifactId>mxnet-infer_${scala.binary.version}</artifactId>
-      <version>1.4.0-SNAPSHOT</version>
+      <version>1.5.0-SNAPSHOT</version>
     </dependency>
   </dependencies>
 
diff --git a/scala-package/assembly/pom.xml b/scala-package/assembly/pom.xml
index 8de320eb2ade..c1d1a3b8e721 100644
--- a/scala-package/assembly/pom.xml
+++ b/scala-package/assembly/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.apache.mxnet</groupId>
     <artifactId>mxnet-parent_2.11</artifactId>
-    <version>1.4.0-SNAPSHOT</version>
+    <version>1.5.0-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 
diff --git a/scala-package/core/pom.xml b/scala-package/core/pom.xml
index 3425bb15f62a..484fbbd96790 100644
--- a/scala-package/core/pom.xml
+++ b/scala-package/core/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.apache.mxnet</groupId>
     <artifactId>mxnet-parent_2.11</artifactId>
-    <version>1.4.0-SNAPSHOT</version>
+    <version>1.5.0-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 
@@ -100,13 +100,13 @@
     <dependency>
       <groupId>org.apache.mxnet</groupId>
       <artifactId>mxnet-init_${scala.binary.version}</artifactId>
-      <version>1.4.0-SNAPSHOT</version>
+      <version>1.5.0-SNAPSHOT</version>
       <scope>provided</scope>
     </dependency>
     <dependency>
       <groupId>org.apache.mxnet</groupId>
       <artifactId>mxnet-macros_${scala.binary.version}</artifactId>
-      <version>1.4.0-SNAPSHOT</version>
+      <version>1.5.0-SNAPSHOT</version>
       <scope>provided</scope>
     </dependency>
     <dependency>
diff --git a/scala-package/examples/pom.xml b/scala-package/examples/pom.xml
index 9e8e119c3c4f..8d3d156a0b18 100644
--- a/scala-package/examples/pom.xml
+++ b/scala-package/examples/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.apache.mxnet</groupId>
     <artifactId>mxnet-parent_2.11</artifactId>
-    <version>1.4.0-SNAPSHOT</version>
+    <version>1.5.0-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 
@@ -149,13 +149,13 @@
     <dependency>
       <groupId>org.apache.mxnet</groupId>
       <artifactId>mxnet-core_${scala.binary.version}</artifactId>
-      <version>1.4.0-SNAPSHOT</version>
+      <version>1.5.0-SNAPSHOT</version>
       <scope>provided</scope>
     </dependency>
     <dependency>
       <groupId>org.apache.mxnet</groupId>
       <artifactId>mxnet-infer_${scala.binary.version}</artifactId>
-      <version>1.4.0-SNAPSHOT</version>
+      <version>1.5.0-SNAPSHOT</version>
       <scope>provided</scope>
     </dependency>
     <dependency>
diff --git a/scala-package/infer/pom.xml b/scala-package/infer/pom.xml
index 3e6980cb6f4b..ac76cdd19f3b 100644
--- a/scala-package/infer/pom.xml
+++ b/scala-package/infer/pom.xml
@@ -6,7 +6,7 @@
     <parent>
         <artifactId>mxnet-parent_2.11</artifactId>
         <groupId>org.apache.mxnet</groupId>
-        <version>1.4.0-SNAPSHOT</version>
+        <version>1.5.0-SNAPSHOT</version>
         <relativePath>../pom.xml</relativePath>
     </parent>
 
@@ -89,7 +89,7 @@
         <dependency>
             <groupId>org.apache.mxnet</groupId>
             <artifactId>mxnet-core_${scala.binary.version}</artifactId>
-            <version>1.4.0-SNAPSHOT</version>
+            <version>1.5.0-SNAPSHOT</version>
             <scope>provided</scope>
         </dependency>
         <!-- https://mvnrepository.com/artifact/org.mockito/mockito-all -->
diff --git a/scala-package/init-native/linux-x86_64/pom.xml b/scala-package/init-native/linux-x86_64/pom.xml
index 12a36bd6e944..b71d7cf71528 100644
--- a/scala-package/init-native/linux-x86_64/pom.xml
+++ b/scala-package/init-native/linux-x86_64/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.apache.mxnet</groupId>
     <artifactId>mxnet-scala-init-native-parent</artifactId>
-    <version>1.4.0-SNAPSHOT</version>
+    <version>1.5.0-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 
@@ -20,7 +20,7 @@
     <dependency>
       <groupId>org.apache.mxnet</groupId>
       <artifactId>mxnet-init_${scala.binary.version}</artifactId>
-      <version>1.4.0-SNAPSHOT</version>
+      <version>1.5.0-SNAPSHOT</version>
       <type>jar</type>
       <scope>compile</scope>
     </dependency>
diff --git a/scala-package/init-native/osx-x86_64/pom.xml b/scala-package/init-native/osx-x86_64/pom.xml
index d0290942ef84..b4a0b1d6584a 100644
--- a/scala-package/init-native/osx-x86_64/pom.xml
+++ b/scala-package/init-native/osx-x86_64/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.apache.mxnet</groupId>
     <artifactId>mxnet-scala-init-native-parent</artifactId>
-    <version>1.4.0-SNAPSHOT</version>
+    <version>1.5.0-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 
@@ -20,7 +20,7 @@
     <dependency>
       <groupId>org.apache.mxnet</groupId>
       <artifactId>mxnet-init_${scala.binary.version}</artifactId>
-      <version>1.4.0-SNAPSHOT</version>
+      <version>1.5.0-SNAPSHOT</version>
       <type>jar</type>
       <scope>compile</scope>
     </dependency>
diff --git a/scala-package/init-native/pom.xml b/scala-package/init-native/pom.xml
index 17a829c0c217..bed216e45035 100644
--- a/scala-package/init-native/pom.xml
+++ b/scala-package/init-native/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.apache.mxnet</groupId>
     <artifactId>mxnet-parent_2.11</artifactId>
-    <version>1.4.0-SNAPSHOT</version>
+    <version>1.5.0-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 
diff --git a/scala-package/init/pom.xml b/scala-package/init/pom.xml
index a5b88c308637..4278df6f2e73 100644
--- a/scala-package/init/pom.xml
+++ b/scala-package/init/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.apache.mxnet</groupId>
     <artifactId>mxnet-parent_2.11</artifactId>
-    <version>1.4.0-SNAPSHOT</version>
+    <version>1.5.0-SNAPSHOT</version>
 <!--  <relativePath>../pom.xml</relativePath>-->
   </parent>
 
diff --git a/scala-package/macros/pom.xml b/scala-package/macros/pom.xml
index d435e211ceeb..cd56060b4b36 100644
--- a/scala-package/macros/pom.xml
+++ b/scala-package/macros/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.apache.mxnet</groupId>
     <artifactId>mxnet-parent_2.11</artifactId>
-    <version>1.4.0-SNAPSHOT</version>
+    <version>1.5.0-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 
@@ -53,13 +53,13 @@
     <dependency>
       <groupId>org.apache.mxnet</groupId>
       <artifactId>mxnet-init_${scala.binary.version}</artifactId>
-      <version>1.4.0-SNAPSHOT</version>
+      <version>1.5.0-SNAPSHOT</version>
       <scope>provided</scope>
     </dependency>
     <dependency>
       <groupId>org.apache.mxnet</groupId>
       <artifactId>libmxnet-init-scala-${platform}</artifactId>
-      <version>1.4.0-SNAPSHOT</version>
+      <version>1.5.0-SNAPSHOT</version>
       <scope>provided</scope>
       <type>${libtype}</type>
     </dependency>
diff --git a/scala-package/native/linux-x86_64-cpu/pom.xml b/scala-package/native/linux-x86_64-cpu/pom.xml
index ac8e4a45e67a..2415cf7d26db 100644
--- a/scala-package/native/linux-x86_64-cpu/pom.xml
+++ b/scala-package/native/linux-x86_64-cpu/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.apache.mxnet</groupId>
     <artifactId>mxnet-scala-native-parent</artifactId>
-    <version>1.4.0-SNAPSHOT</version>
+    <version>1.5.0-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 
@@ -20,7 +20,7 @@
     <dependency>
       <groupId>org.apache.mxnet</groupId>
       <artifactId>mxnet-core_${scala.binary.version}</artifactId>
-      <version>1.4.0-SNAPSHOT</version>
+      <version>1.5.0-SNAPSHOT</version>
       <type>jar</type>
       <scope>compile</scope>
     </dependency>
diff --git a/scala-package/native/linux-x86_64-gpu/pom.xml b/scala-package/native/linux-x86_64-gpu/pom.xml
index cdba5774f6a0..0186217234bc 100644
--- a/scala-package/native/linux-x86_64-gpu/pom.xml
+++ b/scala-package/native/linux-x86_64-gpu/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.apache.mxnet</groupId>
     <artifactId>mxnet-scala-native-parent</artifactId>
-    <version>1.4.0-SNAPSHOT</version>
+    <version>1.5.0-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 
@@ -20,7 +20,7 @@
     <dependency>
       <groupId>org.apache.mxnet</groupId>
       <artifactId>mxnet-core_${scala.binary.version}</artifactId>
-      <version>1.4.0-SNAPSHOT</version>
+      <version>1.5.0-SNAPSHOT</version>
       <type>jar</type>
       <scope>compile</scope>
     </dependency>
diff --git a/scala-package/native/osx-x86_64-cpu/pom.xml b/scala-package/native/osx-x86_64-cpu/pom.xml
index 333486c67392..0ab7ca1dd0f0 100644
--- a/scala-package/native/osx-x86_64-cpu/pom.xml
+++ b/scala-package/native/osx-x86_64-cpu/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.apache.mxnet</groupId>
     <artifactId>mxnet-scala-native-parent</artifactId>
-    <version>1.4.0-SNAPSHOT</version>
+    <version>1.5.0-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 
@@ -20,7 +20,7 @@
     <dependency>
       <groupId>org.apache.mxnet</groupId>
       <artifactId>mxnet-core_${scala.binary.version}</artifactId>
-      <version>1.4.0-SNAPSHOT</version>
+      <version>1.5.0-SNAPSHOT</version>
       <type>jar</type>
       <scope>compile</scope>
     </dependency>
diff --git a/scala-package/native/pom.xml b/scala-package/native/pom.xml
index e267c8d797ab..2f6425d21104 100644
--- a/scala-package/native/pom.xml
+++ b/scala-package/native/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.apache.mxnet</groupId>
     <artifactId>mxnet-parent_2.11</artifactId>
-    <version>1.4.0-SNAPSHOT</version>
+    <version>1.5.0-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 
diff --git a/scala-package/pom.xml b/scala-package/pom.xml
index 76bf00b54ba6..151462cbcc68 100644
--- a/scala-package/pom.xml
+++ b/scala-package/pom.xml
@@ -10,7 +10,7 @@
   </parent>
   <groupId>org.apache.mxnet</groupId>
   <artifactId>mxnet-parent_2.11</artifactId>
-  <version>1.4.0-SNAPSHOT</version>
+  <version>1.5.0-SNAPSHOT</version>
   <name>MXNet Scala Package - Parent</name>
   <url>https://github.com/apache/incubator-mxnet/tree/master/scala-package</url>
   <description>
diff --git a/scala-package/spark/pom.xml b/scala-package/spark/pom.xml
index ee4f3efa98e4..2db3bee8c78d 100644
--- a/scala-package/spark/pom.xml
+++ b/scala-package/spark/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.apache.mxnet</groupId>
     <artifactId>mxnet-parent_2.11</artifactId>
-    <version>1.4.0-SNAPSHOT</version>
+    <version>1.5.0-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 
@@ -40,7 +40,7 @@
     <dependency>
       <groupId>org.apache.mxnet</groupId>
       <artifactId>mxnet-core_${scala.binary.version}</artifactId>
-      <version>1.4.0-SNAPSHOT</version>
+      <version>1.5.0-SNAPSHOT</version>
       <scope>provided</scope>
     </dependency>
     <dependency>
diff --git a/snapcraft.yaml b/snapcraft.yaml
index e70bf6e5b4b3..d8d0e301e6b1 100644
--- a/snapcraft.yaml
+++ b/snapcraft.yaml
@@ -1,5 +1,5 @@
 name: mxnet
-version: '1.4.0'
+version: '1.5.0'
 summary: MXNet is a deep learning framework designed for efficiency and flexibility.
 description: |
   MXNet is a deep learning framework designed for both efficiency and 
diff --git a/tests/cpp/unittest.mk b/tests/cpp/unittest.mk
index 746ee2f096f1..665ce6982874 100644
--- a/tests/cpp/unittest.mk
+++ b/tests/cpp/unittest.mk
@@ -41,22 +41,22 @@ gtest-all.o : $(GTEST_SRCS_)
 gtest.a : gtest-all.o
 	$(AR) $(ARFLAGS) $@ $^
 
-build/tests/cpp/%.o : tests/cpp/%.cc | mkldnn
+build/tests/cpp/%.o : tests/cpp/%.cc
 	@mkdir -p $(@D)
 	$(CXX) -std=c++11 $(TEST_CFLAGS) -I$(GTEST_INC) -MM -MT tests/cpp/$* $< > build/tests/cpp/$*.d
 	$(CXX) -c -std=c++11 $(TEST_CFLAGS) -I$(GTEST_INC) -o build/tests/cpp/$*.o $(filter %.cc %.a, $^)
 
-build/tests/cpp/operator/%.o : tests/cpp/operator/%.cc | mkldnn
+build/tests/cpp/operator/%.o : tests/cpp/operator/%.cc
 	@mkdir -p $(@D)
 	$(CXX) -std=c++11 $(TEST_CFLAGS) -I$(GTEST_INC) -MM -MT tests/cpp/operator/$* $< > build/tests/cpp/operator/$*.d
 	$(CXX) -c -std=c++11 $(TEST_CFLAGS) -I$(GTEST_INC) -o build/tests/cpp/operator/$*.o $(filter %.cc %.a, $^)
 
-build/tests/cpp/storage/%.o : tests/cpp/storage/%.cc | mkldnn
+build/tests/cpp/storage/%.o : tests/cpp/storage/%.cc
 	@mkdir -p $(@D)
 	$(CXX) -std=c++11 $(TEST_CFLAGS) -I$(GTEST_INC) -MM -MT tests/cpp/storage/$* $< > build/tests/cpp/storage/$*.d
 	$(CXX) -c -std=c++11 $(TEST_CFLAGS) -I$(GTEST_INC) -o build/tests/cpp/storage/$*.o $(filter %.cc %.a, $^)
 
-build/tests/cpp/engine/%.o : tests/cpp/engine/%.cc | mkldnn
+build/tests/cpp/engine/%.o : tests/cpp/engine/%.cc
 	@mkdir -p $(@D)
 	$(CXX) -std=c++11 $(TEST_CFLAGS) -I$(GTEST_INC) -MM -MT tests/cpp/engine/$* $< > build/tests/cpp/engine/$*.d
 	$(CXX) -c -std=c++11 $(TEST_CFLAGS) -I$(GTEST_INC) -o build/tests/cpp/engine/$*.o $(filter %.cc %.a, $^)
diff --git a/tests/nightly/model_backwards_compatibility_check/train_mxnet_legacy_models.sh b/tests/nightly/model_backwards_compatibility_check/train_mxnet_legacy_models.sh
index 02d480d9d3ba..bda47f9e650d 100755
--- a/tests/nightly/model_backwards_compatibility_check/train_mxnet_legacy_models.sh
+++ b/tests/nightly/model_backwards_compatibility_check/train_mxnet_legacy_models.sh
@@ -61,8 +61,8 @@ echo `pwd`
 ## This list is sorted in descending order chronologically.
 ## Sample output for the below git tag command is : 1.2.0 utils 1.1.0 1.0.0 0.12.1
 ## so from this sample, we will pick up all the versions matching with the current latest version
-## Now while performing inference the latest version could be 1.4.0, which will help in validating models trained
-## on 1.1.0 and 1.2.0 by loading them on the latest version (1.4.0)
+## Now while performing inference the latest version could be 1.5.0, which will help in validating models trained
+## on 1.1.0 and 1.2.0 by loading them on the latest version (1.5.0)
 ## Over a period of time, the model repository will grow since with every new release we
 ## upload models trained on newer versions as well through this script
 previous_versions=($(git tag --sort=-creatordate | grep --invert-match rc))
diff --git a/tests/python/mkl/test_mkldnn.py b/tests/python/mkl/test_mkldnn.py
index c6c0a0832f1f..d9d3abfc3ced 100644
--- a/tests/python/mkl/test_mkldnn.py
+++ b/tests/python/mkl/test_mkldnn.py
@@ -27,7 +27,6 @@
 from mxnet import gluon
 from mxnet.gluon import nn
 from mxnet.test_utils import *
-import test_mkldnn_install as install
 curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
 sys.path.append(os.path.join(curr_path, '../unittest/'))
 from common import with_seed
@@ -441,7 +440,4 @@ def backward(self, req, out_grad, in_data, out_data, in_grad, aux):
     custom = mx.symbol.Custom(name='custom', data=conv, op_type='custom')
     exec1 = custom.bind(mx.cpu(), args={'data': mx.nd.ones([10,3,96,96]), 'conv_weight': mx.nd.ones([8,3,5,5])})
     exec1.forward()[0].wait_to_read()
-
-
-if __name__ == '__main__':
-    install.test_mkldnn_install()
+    
diff --git a/tests/python/mkl/test_mkldnn_install.py b/tests/python/mkl/test_mkldnn_install.py
deleted file mode 100644
index c2f26df72f2e..000000000000
--- a/tests/python/mkl/test_mkldnn_install.py
+++ /dev/null
@@ -1,56 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-"""
-MKL-DNN related test cases
-"""
-
-import sys
-import os
-import logging
-
-
-def test_mkldnn_install():
-    """
-    This test will verify that MXNet is built/installed correctly when
-    compiled with Intel MKL-DNN library. The method will try to import
-    the mxnet module and see if the mkldnn library is mapped to this
-    process's address space.
-    """
-    logging.basicConfig(level=logging.INFO)
-
-    if not sys.platform.startswith('linux'):
-        logging.info("Bypass mkldnn install test for non-Linux OS")
-        return
-
-    try:
-        #pylint: disable=unused-variable
-        import mxnet as mx
-    except (ImportError, OSError) as e:
-        assert 0, "Import mxnet error: %s. Please double check your build/" \
-            "install steps or environment variable settings" % str(e)
-
-    pid = os.getpid()
-    rc = os.system("cat /proc/" + str(pid) +
-                   "/maps | grep libmkldnn > /dev/null")
-
-    if rc == 0:
-        logging.info("MXNet is built/installed correctly with MKL-DNN")
-    else:
-        assert 0, "MXNet is built/installed incorrectly with MKL-DNN, please " \
-            "double check your build/install steps or environment " \
-            "variable settings"

From cb0db290adcfd0fce956d02c234f81d453e41013 Mon Sep 17 00:00:00 2001
From: Andrew Ayres <andrew.f.ayres@gmail.com>
Date: Wed, 5 Dec 2018 10:35:27 -0800
Subject: [PATCH 32/54] Fixing a 404 in the ubuntu setup doc (#13542)

---
 docs/install/ubuntu_setup.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/install/ubuntu_setup.md b/docs/install/ubuntu_setup.md
index 7d8da182b070..bd1b441d5556 100644
--- a/docs/install/ubuntu_setup.md
+++ b/docs/install/ubuntu_setup.md
@@ -392,7 +392,7 @@ If you use IntelliJ or a similar IDE, you may want to follow the [MXNet-Scala on
 
 To use the MXNet-Java package, you can acquire the Maven package as a dependency.
 
-Further information is in the [MXNet-Java Setup Instructions](java.html).
+Further information is in the [MXNet-Java Setup Instructions](java_setup.html).
 
 If you use IntelliJ or a similar IDE, you may want to follow the [MXNet-Java on IntelliJ tutorial](../tutorials/java/mxnet_java_on_intellij.html) instead.
 <hr>

From 1c8972c3c8f832519364916865541f48597581c7 Mon Sep 17 00:00:00 2001
From: Lanking <lanking520@live.com>
Date: Wed, 5 Dec 2018 10:35:50 -0800
Subject: [PATCH 33/54] [MXNET-1249] Fix Object Detector Performance with GPU
 (#13522)

* Reduce post processing time

* fix ssd

* fix the CI

* add comments
---
 .../org/apache/mxnet/infer/Classifier.scala   | 18 +++++++----
 .../apache/mxnet/infer/ObjectDetector.scala   | 31 ++++++++++---------
 2 files changed, 28 insertions(+), 21 deletions(-)

diff --git a/scala-package/infer/src/main/scala/org/apache/mxnet/infer/Classifier.scala b/scala-package/infer/src/main/scala/org/apache/mxnet/infer/Classifier.scala
index adeb33d34a95..cf55bc10d97e 100644
--- a/scala-package/infer/src/main/scala/org/apache/mxnet/infer/Classifier.scala
+++ b/scala-package/infer/src/main/scala/org/apache/mxnet/infer/Classifier.scala
@@ -24,6 +24,7 @@ import org.slf4j.LoggerFactory
 
 import scala.io
 import scala.collection.mutable.ListBuffer
+import scala.collection.parallel.mutable.ParArray
 
 trait ClassifierBase {
 
@@ -110,16 +111,21 @@ class Classifier(modelPathPrefix: String,
   : IndexedSeq[IndexedSeq[(String, Float)]] = {
 
     // considering only the first output
-    val predictResultND: NDArray = predictor.predictWithNDArray(input)(0)
-
-    val predictResult: ListBuffer[Array[Float]] = ListBuffer[Array[Float]]()
+    // Copy NDArray to CPU to avoid frequent GPU to CPU copying
+    val predictResultND: NDArray =
+      predictor.predictWithNDArray(input)(0).asInContext(Context.cpu())
+    // Parallel Execution with ParArray for better performance
+    val predictResultPar: ParArray[Array[Float]] =
+      new ParArray[Array[Float]](predictResultND.shape(0))
 
     // iterating over the individual items(batch size is in axis 0)
-    for (i <- 0 until predictResultND.shape(0)) {
+    (0 until predictResultND.shape(0)).toVector.par.foreach( i => {
       val r = predictResultND.at(i)
-      predictResult += r.toArray
+      predictResultPar(i) = r.toArray
       r.dispose()
-    }
+    })
+
+    val predictResult = predictResultPar.toArray
 
     var result: ListBuffer[IndexedSeq[(String, Float)]] =
       ListBuffer.empty[IndexedSeq[(String, Float)]]
diff --git a/scala-package/infer/src/main/scala/org/apache/mxnet/infer/ObjectDetector.scala b/scala-package/infer/src/main/scala/org/apache/mxnet/infer/ObjectDetector.scala
index a9b21f8c1dcd..78b237a4a9c6 100644
--- a/scala-package/infer/src/main/scala/org/apache/mxnet/infer/ObjectDetector.scala
+++ b/scala-package/infer/src/main/scala/org/apache/mxnet/infer/ObjectDetector.scala
@@ -19,6 +19,8 @@ package org.apache.mxnet.infer
 
 // scalastyle:off
 import java.awt.image.BufferedImage
+
+import scala.collection.parallel.mutable.ParArray
 // scalastyle:on
 import org.apache.mxnet.NDArray
 import org.apache.mxnet.DataDesc
@@ -94,39 +96,39 @@ class ObjectDetector(modelPathPrefix: String,
   def objectDetectWithNDArray(input: IndexedSeq[NDArray], topK: Option[Int])
   : IndexedSeq[IndexedSeq[(String, Array[Float])]] = {
 
-    val predictResult = predictor.predictWithNDArray(input)(0)
-    var batchResult = ListBuffer[IndexedSeq[(String, Array[Float])]]()
-    for (i <- 0 until predictResult.shape(0)) {
+    // Copy NDArray to CPU to avoid frequent GPU to CPU copying
+    val predictResult = predictor.predictWithNDArray(input)(0).asInContext(Context.cpu())
+    // Parallel Execution with ParArray for better performance
+    var batchResult = new ParArray[IndexedSeq[(String, Array[Float])]](predictResult.shape(0))
+    (0 until predictResult.shape(0)).toArray.par.foreach( i => {
       val r = predictResult.at(i)
-      batchResult += sortAndReformat(r, topK)
+      batchResult(i) = sortAndReformat(r, topK)
       handler.execute(r.dispose())
-    }
+    })
     handler.execute(predictResult.dispose())
     batchResult.toIndexedSeq
   }
 
   private[infer] def sortAndReformat(predictResultND: NDArray, topK: Option[Int])
   : IndexedSeq[(String, Array[Float])] = {
-    val predictResult: ListBuffer[Array[Float]] = ListBuffer[Array[Float]]()
-    val accuracy: ListBuffer[Float] = ListBuffer[Float]()
-
     // iterating over the all the predictions
     val length = predictResultND.shape(0)
 
-    for (i <- 0 until length) {
+    val predictResult = (0 until length).toArray.par.flatMap( i => {
       val r = predictResultND.at(i)
       val tempArr = r.toArray
-      if (tempArr(0) != -1.0) {
-        predictResult += tempArr
-        accuracy += tempArr(1)
+      val res = if (tempArr(0) != -1.0) {
+        Array[Array[Float]](tempArr)
       } else {
         // Ignore the minus 1 part
+        Array[Array[Float]]()
       }
       handler.execute(r.dispose())
-    }
+      res
+    }).toArray
     var result = IndexedSeq[(String, Array[Float])]()
     if (topK.isDefined) {
-      var sortedIndices = accuracy.zipWithIndex.sortBy(-_._1).map(_._2)
+      var sortedIndices = predictResult.zipWithIndex.sortBy(-_._1(1)).map(_._2)
       sortedIndices = sortedIndices.take(topK.get)
       // takeRight(5) would provide the output as Array[Accuracy, Xmin, Ymin, Xmax, Ymax
       result = sortedIndices.map(idx
@@ -136,7 +138,6 @@ class ObjectDetector(modelPathPrefix: String,
       result = predictResult.map(ele
       => (synset(ele(0).toInt), ele.takeRight(5))).toIndexedSeq
     }
-
     result
   }
 

From bd8e0f8356676749ecae16ec38a366b4cc00bf15 Mon Sep 17 00:00:00 2001
From: Pedro Larroy <928489+larroy@users.noreply.github.com>
Date: Wed, 5 Dec 2018 19:49:39 +0100
Subject: [PATCH 34/54] =?UTF-8?q?[MXNET-769]=20Use=20MXNET=5FHOME=20in=20a?=
 =?UTF-8?q?=20tempdir=20in=20windows=20to=20prevent=20access=20denied=20du?=
 =?UTF-8?q?e=20t=E2=80=A6=20(#13531)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Use MXNET_HOME in cwd in windows to prevent access denied due to concurrent data downloads

Fixes #13484

* Revert "Disabled flaky test test_gluon_data.test_recordimage_dataset_with_data_loader_multiworker (#13527)"

This reverts commit 3d499cb3584919b767142c5596211a7f7fb18d50.
---
 ci/windows/test_py2_cpu.ps1              | 3 +++
 ci/windows/test_py2_gpu.ps1              | 3 +++
 ci/windows/test_py3_cpu.ps1              | 3 +++
 ci/windows/test_py3_gpu.ps1              | 3 +++
 tests/python/unittest/test_gluon_data.py | 1 -
 5 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/ci/windows/test_py2_cpu.ps1 b/ci/windows/test_py2_cpu.ps1
index 702a2db90ed7..46e49baeadbb 100644
--- a/ci/windows/test_py2_cpu.ps1
+++ b/ci/windows/test_py2_cpu.ps1
@@ -16,9 +16,12 @@
 # under the License.
 
 7z x -y windows_package.7z
+
 $env:MXNET_LIBRARY_PATH=join-path $pwd.Path windows_package\lib\libmxnet.dll
 $env:PYTHONPATH=join-path $pwd.Path windows_package\python
 $env:MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0
+$env:MXNET_HOME=[io.path]::combine($PSScriptRoot, 'mxnet_home')
+
 c:\Anaconda3\envs\py2\Scripts\pip install -r tests\requirements.txt
 c:\Anaconda3\envs\py2\python.exe -m nose -v --with-timer --timer-ok 1 --timer-warning 15 --timer-filter warning,error --with-xunit --xunit-file nosetests_unittest.xml tests\python\unittest
 if (! $?) { Throw ("Error running unittest") }
diff --git a/ci/windows/test_py2_gpu.ps1 b/ci/windows/test_py2_gpu.ps1
index 0cf2717fa208..d362c61da02b 100644
--- a/ci/windows/test_py2_gpu.ps1
+++ b/ci/windows/test_py2_gpu.ps1
@@ -16,9 +16,12 @@
 # under the License.
 
 7z x -y windows_package.7z
+
 $env:MXNET_LIBRARY_PATH=join-path $pwd.Path windows_package\lib\libmxnet.dll
 $env:PYTHONPATH=join-path $pwd.Path windows_package\python
 $env:MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0
+$env:MXNET_HOME=[io.path]::combine($PSScriptRoot, 'mxnet_home')
+
 c:\Anaconda3\envs\py2\Scripts\pip install -r tests\requirements.txt
 c:\Anaconda3\envs\py2\python.exe -m nose -v --with-timer --timer-ok 1 --timer-warning 15 --timer-filter warning,error --with-xunit --xunit-file nosetests_unittest.xml tests\python\unittest
 if (! $?) { Throw ("Error running unittest") }
diff --git a/ci/windows/test_py3_cpu.ps1 b/ci/windows/test_py3_cpu.ps1
index a7774a63dd86..32da4885fe0a 100644
--- a/ci/windows/test_py3_cpu.ps1
+++ b/ci/windows/test_py3_cpu.ps1
@@ -16,9 +16,12 @@
 # under the License.
 
 7z x -y windows_package.7z
+
 $env:MXNET_LIBRARY_PATH=join-path $pwd.Path windows_package\lib\libmxnet.dll
 $env:PYTHONPATH=join-path $pwd.Path windows_package\python
 $env:MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0
+$env:MXNET_HOME=[io.path]::combine($PSScriptRoot, 'mxnet_home')
+
 c:\Anaconda3\envs\py3\Scripts\pip install -r tests\requirements.txt
 c:\Anaconda3\envs\py3\python.exe -m nose -v --with-timer --timer-ok 1 --timer-warning 15 --timer-filter warning,error --with-xunit --xunit-file nosetests_unittest.xml tests\python\unittest
 if (! $?) { Throw ("Error running unittest") }
diff --git a/ci/windows/test_py3_gpu.ps1 b/ci/windows/test_py3_gpu.ps1
index f9955ef6473f..b30b22ae90e4 100644
--- a/ci/windows/test_py3_gpu.ps1
+++ b/ci/windows/test_py3_gpu.ps1
@@ -16,9 +16,12 @@
 # under the License.
 
 7z x -y windows_package.7z
+
 $env:MXNET_LIBRARY_PATH=join-path $pwd.Path windows_package\lib\libmxnet.dll
 $env:PYTHONPATH=join-path $pwd.Path windows_package\python
 $env:MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0
+$env:MXNET_HOME=[io.path]::combine($PSScriptRoot, 'mxnet_home')
+
 c:\Anaconda3\envs\py3\Scripts\pip install -r tests\requirements.txt
 c:\Anaconda3\envs\py3\python.exe -m nose -v --with-timer --timer-ok 1 --timer-warning 15 --timer-filter warning,error --with-xunit --xunit-file nosetests_unittest.xml tests\python\unittest
 if (! $?) { Throw ("Error running unittest") }
diff --git a/tests/python/unittest/test_gluon_data.py b/tests/python/unittest/test_gluon_data.py
index d043a7c6b802..e4206095f9ba 100644
--- a/tests/python/unittest/test_gluon_data.py
+++ b/tests/python/unittest/test_gluon_data.py
@@ -78,7 +78,6 @@ def _dataset_transform_fn(x, y):
     return x, y
 
 @with_seed()
-@unittest.skip("Flaky test: https://github.com/apache/incubator-mxnet/issues/13484")
 def test_recordimage_dataset_with_data_loader_multiworker():
     recfile = prepare_record()
     dataset = gluon.data.vision.ImageRecordDataset(recfile)

From f6f840110d74111f98c20eab5b08d64a46ebf0cd Mon Sep 17 00:00:00 2001
From: Pedro Larroy <pedro.larroy.lists@gmail.com>
Date: Wed, 5 Dec 2018 21:42:24 +0100
Subject: [PATCH 35/54] Add a retry to qemu_provision (#13551)

Fixes #13504
---
 ci/docker/qemu/vmcontrol.py | 42 +++++++++++++++++++++++++++++++++++++
 1 file changed, 42 insertions(+)

diff --git a/ci/docker/qemu/vmcontrol.py b/ci/docker/qemu/vmcontrol.py
index a7e8c0ff0122..d80e22b1db85 100644
--- a/ci/docker/qemu/vmcontrol.py
+++ b/ci/docker/qemu/vmcontrol.py
@@ -69,6 +69,46 @@
   -nographic
 """
 
+def retry(target_exception, tries=4, delay_s=1, backoff=2):
+    """Retry calling the decorated function using an exponential backoff.
+
+    http://www.saltycrane.com/blog/2009/11/trying-out-retry-decorator-python/
+    original from: http://wiki.python.org/moin/PythonDecoratorLibrary#Retry
+
+    :param target_exception: the exception to retry on. may be a tuple of
+        exceptions; any other exception propagates immediately
+    :type target_exception: Exception or tuple
+    :param tries: total number of attempts (not retries); the last one is not guarded
+    :type tries: int
+    :param delay_s: initial delay between retries in seconds
+    :type delay_s: int
+    :param backoff: backoff multiplier e.g. value of 2 will double the delay
+        each retry
+    :type backoff: int
+    """
+    import time
+    from functools import wraps
+
+    def decorated_retry(f):
+        @wraps(f)
+        def f_retry(*args, **kwargs):
+            mtries, mdelay = tries, delay_s  # per-call copies of the decorator arguments
+            while mtries > 1:  # every attempt except the last one is guarded
+                try:
+                    return f(*args, **kwargs)
+                except target_exception as e:
+                    logging.warning("Exception: %s, Retrying in %d seconds...", str(e), mdelay)
+                    time.sleep(mdelay)
+                    mtries -= 1
+                    mdelay *= backoff
+            return f(*args, **kwargs)  # final attempt: its exception reaches the caller
+
+        return f_retry  # the wrapper that replaces f
+
+    return decorated_retry
+
+
+
 
 class VMError(RuntimeError):
     pass
@@ -177,6 +217,8 @@ def qemu_rsync(ssh_port, local_path, remote_path):
 def qemu_rsync_to_host(ssh_port, remote_path, local_path):
     check_call(['rsync', '-e', 'ssh -o StrictHostKeyChecking=no -p{}'.format(ssh_port), '-va', 'qemu@localhost:{}'.format(remote_path), local_path])
 
+
+@retry(subprocess.CalledProcessError)
 def qemu_provision(ssh_port=QEMU_SSH_PORT):
     import glob
     logging.info("Provisioning the VM with artifacts and sources")

From f6b4665995f8f8ff32862a029b2074475d8467eb Mon Sep 17 00:00:00 2001
From: "Joshua Z. Zhang" <cheungchih@gmail.com>
Date: Wed, 5 Dec 2018 13:39:17 -0800
Subject: [PATCH 36/54] Fix #13521 (#13537)

* fix pool release

* fix
---
 python/mxnet/gluon/data/dataloader.py    |  6 ++++++
 tests/python/unittest/test_gluon_data.py | 11 +++++++++++
 2 files changed, 17 insertions(+)

diff --git a/python/mxnet/gluon/data/dataloader.py b/python/mxnet/gluon/data/dataloader.py
index ad0f534d16dd..586e620470d3 100644
--- a/python/mxnet/gluon/data/dataloader.py
+++ b/python/mxnet/gluon/data/dataloader.py
@@ -556,3 +556,9 @@ def same_process_iter():
 
     def __len__(self):
         return len(self._batch_sampler)
+
+    def __del__(self):
+        """Terminate the worker pool; on Linux it is not terminated automatically."""
+        pool = getattr(self, '_worker_pool', None)  # unset if __init__ raised early
+        if pool:
+            pool.terminate()
diff --git a/tests/python/unittest/test_gluon_data.py b/tests/python/unittest/test_gluon_data.py
index e4206095f9ba..a3ba222c71d8 100644
--- a/tests/python/unittest/test_gluon_data.py
+++ b/tests/python/unittest/test_gluon_data.py
@@ -244,6 +244,17 @@ def test_multi_worker_forked_data_loader():
         for i, data in enumerate(loader):
             pass
 
+@with_seed()
+def test_multi_worker_dataloader_release_pool():
+    # hits "too many open files" if a deleted loader does not release its worker pool
+    for _ in range(100):
+        A = np.random.rand(999, 2000)
+        D = mx.gluon.data.DataLoader(A, batch_size=8, num_workers=8)
+        the_iter = iter(D)
+        next(the_iter)
+        del the_iter
+        del D
+
 if __name__ == '__main__':
     import nose
     nose.runmodule()

From 8bbac827742c21607a863137792f03bd09847419 Mon Sep 17 00:00:00 2001
From: Holger Kohr <ho.kohr@zoho.com>
Date: Thu, 6 Dec 2018 01:38:46 +0100
Subject: [PATCH 37/54] Simplifications and some fun stuff for the MNIST Gluon
 tutorial (#13094)

* Simplify mnist Gluon tutorial and add mislabelled sample plotting

* Add mnist Gluon tutorial images

* Gluon MNIST tutorial: Use modern Gluon constructs, fix some wordings

* [Gluon] Move to data loaders and improve wording in MNIST tutorial

* Fix broken links

* Fix spelling of mislabeled

* Final rewordings and code simplifications

* Fix things according to review

- Apply hybrid blocks
- Move outputs outside of code blocks and mark for notebooks
  to ignore
- Remove images, use external link
- Fix a few formulations

* Change activations to sigmoid in MNIST tutorial

* Remove superfluous last layer activations in MNIST tutorial
---
 docs/tutorials/gluon/mnist.md | 554 ++++++++++++++++++++--------------
 1 file changed, 332 insertions(+), 222 deletions(-)

diff --git a/docs/tutorials/gluon/mnist.md b/docs/tutorials/gluon/mnist.md
index 5b8a98a3d668..35fb40521f62 100644
--- a/docs/tutorials/gluon/mnist.md
+++ b/docs/tutorials/gluon/mnist.md
@@ -1,24 +1,22 @@
-# Handwritten Digit Recognition
+# Hand-written Digit Recognition
 
-In this tutorial, we'll give you a step by step walk-through of how to build a hand-written digit classifier using the [MNIST](https://en.wikipedia.org/wiki/MNIST_database) dataset.
+In this tutorial, we'll give you a step-by-step walkthrough of building a hand-written digit classifier using the [MNIST](https://en.wikipedia.org/wiki/MNIST_database) dataset.
 
-MNIST is a widely used dataset for the hand-written digit classification task. It consists of 70,000 labeled 28x28 pixel grayscale images of hand-written digits. The dataset is split into 60,000 training images and 10,000 test images. There are 10 classes (one for each of the 10 digits). The task at hand is to train a model using the 60,000 training images and subsequently test its classification accuracy on the 10,000 test images.
+MNIST is a widely used dataset for the hand-written digit classification task. It consists of 70,000 labeled grayscale images of hand-written digits, each 28x28 pixels in size. The dataset is split into 60,000 training images and 10,000 test images. There are 10 classes (one for each of the 10 digits). The task at hand is to train a model that can correctly classify the images into the digits they represent. The 60,000 training images are used to fit the model, and its performance in terms of classification accuracy is subsequently validated on the 10,000 test images.
 
 ![png](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/example/mnist.png)
 
 **Figure 1:** Sample images from the MNIST dataset.
 
-This tutorial uses MXNet's new high-level interface, gluon package to implement MLP using
-imperative fashion.
-
-This is based on the Mnist tutorial with symbolic approach. You can find it [here](http://mxnet.io/tutorials/python/mnist.html).
+This tutorial uses MXNet's high-level *Gluon* interface to implement neural networks in an imperative fashion. It is based on [the corresponding tutorial written with the symbolic approach](https://mxnet.incubator.apache.org/tutorials/python/mnist.html).
 
 ## Prerequisites
-To complete this tutorial, we need:
 
-- MXNet. See the instructions for your operating system in [Setup and Installation](http://mxnet.io/install/index.html).
+To complete this tutorial, you need:
 
-- [Python Requests](http://docs.python-requests.org/en/master/) and [Jupyter Notebook](http://jupyter.org/index.html).
+- MXNet. See the instructions for your operating system in [Setup and Installation](https://mxnet.incubator.apache.org/install/index.html).
+- The Python [`requests`](http://docs.python-requests.org/en/master/) library.
+- (Optional) The [Jupyter Notebook](https://jupyter.org/index.html) software for interactively running the provided `.ipynb` file.
 
 ```
 $ pip install requests jupyter
@@ -26,308 +24,420 @@ $ pip install requests jupyter
 
 ## Loading Data
 
-Before we define the model, let's first fetch the [MNIST](http://yann.lecun.com/exdb/mnist/) dataset.
+The following code downloads the MNIST dataset to the default location (`.mxnet/datasets/mnist/` in your home directory) and creates `Dataset` objects `train_data` and `val_data` for training and validation, respectively.
+These objects can later be used to get one image or a batch of images at a time, together with their corresponding labels.
 
-The following source code downloads and loads the images and the corresponding labels into memory.
+We also immediately apply the `transform_first()` method and supply a function that moves the channel axis of the images to the beginning (`(28, 28, 1) -> (1, 28, 28)`), casts them to `float32` and rescales them from `[0, 255]` to `[0, 1]`.
+The name `transform_first` reflects the fact that these datasets contain images and labels, and that the transform should only be applied to the first of each `(image, label)` pair.
 
 ```python
 import mxnet as mx
 
-# Fixing the random seed
+# Select a fixed random seed for reproducibility
 mx.random.seed(42)
 
-mnist = mx.test_utils.get_mnist()
+def data_xform(data):
+    """Move channel axis to the beginning, cast to float32, and normalize to [0, 1]."""
+    return nd.moveaxis(data, 2, 0).astype('float32') / 255
+
+train_data = mx.gluon.data.vision.MNIST(train=True).transform_first(data_xform)
+val_data = mx.gluon.data.vision.MNIST(train=False).transform_first(data_xform)
 ```
 
-After running the above source code, the entire MNIST dataset should be fully loaded into memory. Note that for large datasets it is not feasible to pre-load the entire dataset first like we did here. What is needed is a mechanism by which we can quickly and efficiently stream data directly from the source. MXNet Data iterators come to the rescue here by providing exactly that. Data iterator is the mechanism by which we feed input data into an MXNet training algorithm and they are very simple to initialize and use and are optimized for speed. During training, we typically process training samples in small batches and over the entire training lifetime will end up processing each training example multiple times. In this tutorial, we'll configure the data iterator to feed examples in batches of 100. Keep in mind that each example is a 28x28 grayscale image and the corresponding label.
+Since the MNIST dataset is relatively small, the `MNIST` class loads it into memory all at once, but for larger datasets like ImageNet, this would no longer be possible.
+The Gluon `Dataset` class from which `MNIST` derives supports both cases.
+In general, `Dataset` and `DataLoader` (which we will encounter next) are the machinery in MXNet that provides a stream of input data to be consumed by a training algorithm, typically in batches of multiple data entities at once for better efficiency.
+In this tutorial, we will configure the data loader to feed examples in batches of 100.
+
+An image batch is commonly represented as a 4-D array with shape `(batch_size, num_channels, height, width)`.
+This convention is denoted by "NCHW", and it is the default in MXNet.
+For the MNIST dataset, each image has a size of 28x28 pixels and one color channel (grayscale), hence the shape of an input batch will be `(batch_size, 1, 28, 28)`.
 
-Image batches are commonly represented by a 4-D array with shape `(batch_size, num_channels, width, height)`. For the MNIST dataset, since the images are grayscale, there is only one color channel. Also, the images are 28x28 pixels, and so each image has width and height equal to 28. Therefore, the shape of input is `(batch_size, 1, 28, 28)`. Another important consideration is the order of input samples. When feeding training examples, it is critical that we don't feed samples with the same label in succession. Doing so can slow down training.
-Data iterators take care of this by randomly shuffling the inputs. Note that we only need to shuffle the training data. The order does not matter for test data.
+Another important consideration is the order of input samples.
+When feeding training examples, it is critical not to feed samples with the same label in succession since doing so can slow down training progress.
+Data iterators, i.e., instances of [`DataLoader`](https://mxnet.incubator.apache.org/api/python/gluon/data.html#mxnet.gluon.data.DataLoader), take care of this issue by randomly shuffling the inputs.
+Note that we only need to shuffle the training data -- for validation data, the order does not matter.
 
-The following source code initializes the data iterators for the MNIST dataset. Note that we initialize two iterators: one for train data and one for test data.
+The following code initializes the data iterators for the MNIST dataset.
 
 ```python
 batch_size = 100
-train_data = mx.io.NDArrayIter(mnist['train_data'], mnist['train_label'], batch_size, shuffle=True)
-val_data = mx.io.NDArrayIter(mnist['test_data'], mnist['test_label'], batch_size)
+train_loader = mx.gluon.data.DataLoader(train_data, shuffle=True, batch_size=batch_size)
+val_loader = mx.gluon.data.DataLoader(val_data, shuffle=False, batch_size=batch_size)
 ```
 
 ## Approaches
 
-We will cover a couple of approaches for performing the hand written digit recognition task. The first approach makes use of a traditional deep neural network architecture called Multilayer Perceptron (MLP). We'll discuss its drawbacks and use that as a motivation to introduce a second more advanced approach called Convolution Neural Network (CNN) that has proven to work very well for image classification tasks.
+We will cover two approaches for performing the hand-written digit recognition task.
+In our first attempt, we will make use of a traditional neural network architecture called [Multilayer Perceptron (MLP)](https://en.wikipedia.org/wiki/Multilayer_perceptron).
+Although this architecture lets us achieve over 95 % accuracy on the validation set, we will recognize and discuss some of its drawbacks and use them as a motivation for using a different network.
+In the subsequent second attempt, we introduce the more advanced and very widely used [Convolutional Neural Network (CNN)](https://en.wikipedia.org/wiki/Convolutional_neural_network) architecture that has proven to work very well for image classification tasks.
 
-Now, let's import required nn modules
+As a first step, we run some convenience imports of frequently used modules.
 
 ```python
-from __future__ import print_function
+from __future__ import print_function  # only relevant for Python 2
 import mxnet as mx
-from mxnet import gluon
+from mxnet import nd, gluon, autograd
 from mxnet.gluon import nn
-from mxnet import autograd as ag
 ```
 
-### Define a network: Multilayer Perceptron
+### Defining a network: Multilayer Perceptron (MLP)
 
-The first approach makes use of a [Multilayer Perceptron](https://en.wikipedia.org/wiki/Multilayer_perceptron) to solve this problem. We'll define the MLP using MXNet's imperative approach.
+MLPs consist of several fully connected layers.
+In a fully connected (short: FC) layer, each neuron is connected to every neuron in its preceding layer.
+From a linear algebra perspective, an FC layer applies an [affine transform](https://en.wikipedia.org/wiki/Affine_transformation) *Y = X W + b* to an input matrix *X* of size (*n x m*) and outputs a matrix *Y* of size (*n x k*).
+The number *k*, also referred to as *hidden size*, corresponds to the number of neurons in the FC layer.
+An FC layer has two learnable parameters: the (*m x k*) weight matrix *W* and the (*1 x k*) bias vector *b*.
 
-MLPs consist of several fully connected layers. A fully connected layer or FC layer for short, is one where each neuron in the layer is connected to every neuron in its preceding layer. From a linear algebra perspective, an FC layer applies an [affine transform](https://en.wikipedia.org/wiki/Affine_transformation) to the *n x m* input matrix *X* and outputs a matrix *Y* of size *n x k*, where *k* is the number of neurons in the FC layer. *k* is also referred to as the hidden size. The output *Y* is computed according to the equation *Y = W X + b*. The FC layer has two learnable parameters, the *m x k* weight matrix *W* and the *m x 1* bias vector *b*.
+In an MLP, the outputs of FC layers are typically fed into an activation function that applies an elementwise nonlinearity.
+This step is crucial since it gives neural networks the ability to classify inputs that are not linearly separable.
+Common choices for activation functions are [sigmoid](https://en.wikipedia.org/wiki/Sigmoid_function), [hyperbolic tangent ("tanh")](https://en.wikipedia.org/wiki/Hyperbolic_function#Definitions), and [rectified linear unit (ReLU)](https://en.wikipedia.org/wiki/Rectifier_(neural_networks)).
+In this example, we'll use the ReLU activation function since it has several nice properties that make it a good default choice.
 
-In an MLP, the outputs of most FC layers are fed into an activation function, which applies an element-wise non-linearity. This step is critical and it gives neural networks the ability to classify inputs that are not linearly separable. Common choices for activation functions are sigmoid, tanh, and [rectified linear unit](https://en.wikipedia.org/wiki/Rectifier_%28neural_networks%29) (ReLU). In this example, we'll use the ReLU activation function which has several desirable properties and is typically considered a default choice.
+The following code snippet declares three fully connected (or *dense*) layers with 128, 64 and 10 neurons, respectively, where the number of neurons in the last layer matches the number of output classes in our dataset.
+Note that the last layer uses no activation function since the [softmax](https://mxnet.incubator.apache.org/api/python/ndarray/ndarray.html#mxnet.ndarray.softmax) activation will be implicitly applied by the loss function later on.
+To build the neural network, we use a [`HybridSequential`](https://mxnet.incubator.apache.org/api/python/gluon/gluon.html#mxnet.gluon.nn.HybridSequential) layer, which is a convenience class to build a linear stack of layers, often called a *feed-forward neural net*.
 
-The following code declares three fully connected layers with 128, 64 and 10 neurons each.
-The last fully connected layer often has its hidden size equal to the number of output classes in the dataset. Furthermore, these FC layers uses ReLU activation for performing an element-wise ReLU transformation on the FC layer output.
-
-To do this, we will use [Sequential layer](http://mxnet.io/api/python/gluon/gluon.html#mxnet.gluon.nn.Sequential) type. This is simply a linear stack of neural network layers. `nn.Dense` layers are nothing but the fully connected layers we discussed above.
+The "Hybrid" part of the name `HybridSequential` refers to the fact that such a layer can be used with both the Gluon API and the Symbol API.
+Using hybrid blocks over dynamic-only blocks (e.g. [`Sequential`](https://mxnet.incubator.apache.org/api/python/gluon/gluon.html#mxnet.gluon.nn.Sequential)) has several advantages apart from being compatible with a wider range of existing code: for instance, the computation graph of the network can be visualized with `mxnet.viz.plot_network()` and inspected for errors.
+Unless a network requires non-static runtime elements like loops, conditionals or random layer selection in its forward pass, it is generally a good idea to err on the side of hybrid blocks.
+For details on the differences, see the documentation on [`Block`](https://mxnet.incubator.apache.org/api/python/gluon/gluon.html#mxnet.gluon.Block) and [`HybridBlock`](https://mxnet.incubator.apache.org/api/python/gluon/gluon.html#mxnet.gluon.HybridBlock).
 
 ```python
-# define network
-net = nn.Sequential()
+net = nn.HybridSequential(prefix='MLP_')
 with net.name_scope():
-    net.add(nn.Dense(128, activation='relu'))
-    net.add(nn.Dense(64, activation='relu'))
-    net.add(nn.Dense(10))
+    net.add(
+        nn.Flatten(),
+        nn.Dense(128, activation='relu'),
+        nn.Dense(64, activation='relu'),
+        nn.Dense(10, activation=None)  # loss function includes softmax already, see below
+    )
 ```
 
-#### Initialize parameters and optimizer
+**Note**: using the `name_scope()` context manager is optional.
+It is, however, good practice since it uses a common prefix for the names of all layers generated in that scope, which can be very helpful during debugging.
 
-The following source code initializes all parameters received from parameter dict using [Xavier](http://mxnet.io/api/python/optimization/optimization.html#mxnet.initializer.Xavier) initializer
-to train the MLP network we defined above.
+#### Initializing parameters and optimizer
 
-For our training, we will make use of the stochastic gradient descent (SGD) optimizer. In particular, we'll be using mini-batch SGD. Standard SGD processes train data one example at a time. In practice, this is very slow and one can speed up the process by processing examples in small batches. In this case, our batch size will be 100, which is a reasonable choice. Another parameter we select here is the learning rate, which controls the step size the optimizer takes in search of a solution. We'll pick a learning rate of 0.02, again a reasonable choice. Settings such as batch size and learning rate are what are usually referred to as hyper-parameters. What values we give them can have a great impact on training performance.
+Before the network can be used, its parameters (weights and biases) need to be set to initial values that are sufficiently random while keeping the magnitude of gradients limited.
+The [Xavier](https://mxnet.incubator.apache.org/api/python/optimization/optimization.html#mxnet.initializer.Xavier) initializer is usually a good default choice.
 
-We will use [Trainer](http://mxnet.io/api/python/gluon/gluon.html#trainer) class to apply the
-[SGD optimizer](http://mxnet.io/api/python/optimization/optimization.html#mxnet.optimizer.SGD) on the
-initialized parameters.
+Since the `net.initialize()` method creates arrays for its parameters, it needs to know where to store the values: in CPU or GPU memory.
+Like many other functions and classes that deal with memory management in one way or another, the `initialize()` method takes an optional `ctx` (short for *context*) argument, where the return value of either `mx.cpu()` or `mx.gpu()` can be provided.
 
 ```python
-gpus = mx.test_utils.list_gpus()
-ctx =  [mx.gpu()] if gpus else [mx.cpu(0), mx.cpu(1)]
-net.initialize(mx.init.Xavier(magnitude=2.24), ctx=ctx)
-trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.02})
+ctx = mx.gpu(0) if mx.context.num_gpus() > 0 else mx.cpu(0)
+net.initialize(mx.init.Xavier(), ctx=ctx)
 ```
 
-#### Train the network
+To train the network parameters, we will make use of the [stochastic gradient descent (SGD)](https://en.wikipedia.org/wiki/Stochastic_gradient_descent) optimizer.
+More specifically, we use mini-batch SGD in contrast to the classical SGD that processes one example at a time, which is very slow in practice.
+(Recall that we set the batch size to 100 in the ["Loading Data"](#loading-data) part.)
+
+Besides the batch size, the SGD algorithm has one important *hyperparameter*: the *learning rate*.
+It determines the size of steps that the algorithm takes in search of parameters that allow the network to optimally fit the training data.
+Therefore, this value has great influence on both the course of the training process and its final outcome.
+In general, hyperparameters refer to *non-learnable* values that need to be chosen before training and that have a potential effect on the outcome.
+In this example, further hyperparameters are the number of layers in the network, the number of neurons of the first two layers, the activation function and (later) the loss function.
+
+The SGD optimization method can be accessed in MXNet Gluon through the [`Trainer`](https://mxnet.incubator.apache.org/api/python/gluon/gluon.html#trainer) class.
+Internally, it makes use of the [`SGD`](https://mxnet.incubator.apache.org/api/python/optimization/optimization.html#mxnet.optimizer.SGD) optimizer class.
 
-Typically, one runs the training until convergence, which means that we have learned a good set of model parameters (weights + biases) from the train data. For the purpose of this tutorial, we'll run training for 10 epochs and stop. An epoch is one full pass over the entire train data.
+```python
+trainer = gluon.Trainer(
+    params=net.collect_params(),
+    optimizer='sgd',
+    optimizer_params={'learning_rate': 0.04},
+)
+```
+
+#### Training
 
-We will take following steps for training:
+Training the network requires a way to tell how well the network currently fits the training data.
+Following common practice in optimization, this quality of fit is expressed through a *loss value* (also referred to as badness-of-fit or data discrepancy), which the algorithm then tries to minimize by adjusting the weights of the model.
 
-- Define [Accuracy evaluation metric](http://mxnet.io/api/python/metric/metric.html#mxnet.metric.Accuracy) over training data.
-- Loop over inputs for every epoch.
-- Forward input through network to get output.
-- Compute loss with output and label inside record scope.
-- Backprop gradient inside record scope.
-- Update evaluation metric and parameters with gradient descent.
+Ideally, in a classification task, we would like to use the prediction inaccuracy, i.e., the fraction of incorrectly classified samples, to guide the training to a lower value.
+Unfortunately, inaccuracy is a poor choice for training since it contains almost no information that can be used to update the network parameters (its gradient is zero almost everywhere).
+As a better behaved proxy for inaccuracy, the [softmax cross-entropy loss](https://mxnet.incubator.apache.org/api/python/gluon/loss.html#mxnet.gluon.loss.SoftmaxCrossEntropyLoss) is a popular choice.
+It has the essential property of being minimal for the correct prediction, but at the same time, it is everywhere differentiable with nonzero gradient.
+The [accuracy](https://mxnet.incubator.apache.org/api/python/metric/metric.html#mxnet.metric.Accuracy) metric is still useful for monitoring the training progress, since it is more intuitively interpretable than a loss value.
 
-Loss function takes (output, label) pairs and computes a scalar loss for each sample in the mini-batch. The scalars measure how far each output is from the label.
-There are many predefined loss functions in gluon.loss. Here we use
-[softmax_cross_entropy_loss](http://mxnet.io/api/python/gluon/gluon.html#mxnet.gluon.loss.softmax_cross_entropy_loss) for digit classification. We will compute loss and do backward propagation inside
-training scope which is defined by `autograd.record()`.
+**Note:** `SoftmaxCrossEntropyLoss` combines the softmax activation and the cross entropy loss function in one layer, therefore the last layer in our network has no activation function.
 
 ```python
-%%time
-epoch = 10
-# Use Accuracy as the evaluation metric.
 metric = mx.metric.Accuracy()
-softmax_cross_entropy_loss = gluon.loss.SoftmaxCrossEntropyLoss()
-for i in range(epoch):
-    # Reset the train data iterator.
-    train_data.reset()
-    # Loop over the train data iterator.
-    for batch in train_data:
-        # Splits train data into multiple slices along batch_axis
-        # and copy each slice into a context.
-        data = gluon.utils.split_and_load(batch.data[0], ctx_list=ctx, batch_axis=0)
-        # Splits train labels into multiple slices along batch_axis
-        # and copy each slice into a context.
-        label = gluon.utils.split_and_load(batch.label[0], ctx_list=ctx, batch_axis=0)
-        outputs = []
-        # Inside training scope
-        with ag.record():
-            for x, y in zip(data, label):
-                z = net(x)
-                # Computes softmax cross entropy loss.
-                loss = softmax_cross_entropy_loss(z, y)
-                # Backpropagate the error for one iteration.
-                loss.backward()
-                outputs.append(z)
-        # Updates internal evaluation
-        metric.update(label, outputs)
-        # Make one step of parameter update. Trainer needs to know the
-        # batch size of data to normalize the gradient by 1/batch_size.
-        trainer.step(batch.data[0].shape[0])
-    # Gets the evaluation result.
+loss_function = gluon.loss.SoftmaxCrossEntropyLoss()
+```
+
+Typically, the training is run until convergence, which means that further iterations will no longer lead to improvements of the loss function, and that the network has probably learned a good set of model parameters from the training data.
+For the purpose of this tutorial, we only loop 10 times over the entire dataset.
+One such pass over the data is usually called an *epoch*.
+
+The following steps are taken in each `epoch`:
+
+- Get a minibatch of `inputs` and `labels` from the `train_loader`.
+- Feed the `inputs` to the network, producing `outputs`.
+- Compute the minibatch `loss` value by comparing `outputs` to `labels`.
+- Use backpropagation to compute the gradients of the loss with respect to each of the network parameters by calling `loss.backward()`.
+- Update the parameters of the network according to the optimizer rule with `trainer.step(batch_size=inputs.shape[0])`.
+- Print the current accuracy over the training data, i.e., the fraction of correctly classified training examples.
+
+```python
+num_epochs = 10
+
+for epoch in range(num_epochs):
+    for inputs, labels in train_loader:
+        # Possibly copy inputs and labels to the GPU
+        inputs = inputs.as_in_context(ctx)
+        labels = labels.as_in_context(ctx)
+
+        # The forward pass and the loss computation need to be wrapped
+        # in a `record()` scope to make sure the computational graph is
+        # recorded in order to automatically compute the gradients
+        # during the backward pass.
+        with autograd.record():
+            outputs = net(inputs)
+            loss = loss_function(outputs, labels)
+
+        # Compute gradients by backpropagation and update the evaluation
+        # metric
+        loss.backward()
+        metric.update(labels, outputs)
+
+        # Update the parameters by stepping the trainer; the batch size
+        # is required to normalize the gradients by `1 / batch_size`.
+        trainer.step(batch_size=inputs.shape[0])
+
+    # Print the evaluation metric and reset it for the next epoch
     name, acc = metric.get()
-    # Reset evaluation result to initial state.
+    print('After epoch {}: {} = {}'.format(epoch + 1, name, acc))
     metric.reset()
-    print('training acc at epoch %d: %s=%f'%(i, name, acc))
 ```
 
-#### Prediction
+#### Validation
+
+When the above training has completed, we can evaluate the trained model by comparing predictions from the validation dataset with their respective correct labels.
+It is important to notice that the validation data was not used during training, i.e., the network has not seen the images and their true labels yet.
+Keeping a part of the data aside for validation is crucial for detecting *overfitting* of a network: If a neural network has enough parameters, it can simply memorize the training data and look up the true label for a given training image.
+While this results in 100 % training accuracy, such an overfit model would perform very poorly on new data.
+In other words, an overfit model does not generalize to a broader class of inputs than the training set, and such an outcome is almost always undesirable.
+Therefore, having a subset of "unseen" data for validation is an important part of good practice in machine learning.
 
-After the above training completes, we can evaluate the trained model by running predictions on validation dataset. Since the dataset also has labels for all test images, we can compute the accuracy metric over validation data as follows:
+To validate our model on the validation data, we can run the following snippet of code:
 
 ```python
-# Use Accuracy as the evaluation metric.
 metric = mx.metric.Accuracy()
-# Reset the validation data iterator.
-val_data.reset()
-# Loop over the validation data iterator.
-for batch in val_data:
-    # Splits validation data into multiple slices along batch_axis
-    # and copy each slice into a context.
-    data = gluon.utils.split_and_load(batch.data[0], ctx_list=ctx, batch_axis=0)
-    # Splits validation label into multiple slices along batch_axis
-    # and copy each slice into a context.
-    label = gluon.utils.split_and_load(batch.label[0], ctx_list=ctx, batch_axis=0)
-    outputs = []
-    for x in data:
-        outputs.append(net(x))
-    # Updates internal evaluation
-    metric.update(label, outputs)
-print('validation acc: %s=%f'%metric.get())
-assert metric.get()[1] > 0.94
+for inputs, labels in val_loader:
+    # Possibly copy inputs and labels to the GPU
+    inputs = inputs.as_in_context(ctx)
+    labels = labels.as_in_context(ctx)
+    metric.update(labels, net(inputs))
+print('Validation: {} = {}'.format(*metric.get()))
+assert metric.get()[1] > 0.96
 ```
 
-If everything went well, we should see an accuracy value that is around 0.96, which means that we are able to accurately predict the digit in 96% of test images. This is a pretty good result. But as we will see in the next part of this tutorial, we can do a lot better than that.
-
-### Convolutional Neural Network
+If everything went well, we should see an accuracy value that is around 0.968, which means that we are able to accurately predict the digit in 97 % of test images.
+This is a pretty good result, but as we will see in the next part of this tutorial, we can do a lot better than that.
 
-Earlier, we briefly touched on a drawback of MLP when we said we need to discard the input image's original shape and flatten it as a vector before we can feed it as input to the MLP's first fully connected layer. Turns out this is an important issue because we don't take advantage of the fact that pixels in the image have natural spatial correlation along the horizontal and vertical axes. A convolutional neural network (CNN) aims to address this problem by using a more structured weight representation. Instead of flattening the image and doing a simple matrix-matrix multiplication, it employs one or more convolutional layers that each performs a 2-D convolution on the input image.
+That said, a single number only conveys very limited information on the performance of our neural network.
+It is always a good idea to actually look at the images on which the network performed poorly, and check for clues on how to improve the performance.
+We do that with the help of a small function that produces a list of the images which the network got wrong, together with the predicted and true labels.
 
-A single convolution layer consists of one or more filters that each play the role of a feature detector. During training, a CNN learns appropriate representations (parameters) for these filters. Similar to MLP, the output from the convolutional layer is transformed by applying a non-linearity. Besides the convolutional layer, another key aspect of a CNN is the pooling layer. A pooling layer serves to make the CNN translation invariant: a digit remains the same even when it is shifted left/right/up/down by a few pixels. A pooling layer reduces a *n x m* patch into a single value to make the network less sensitive to the spatial location. Pooling layer is always included after each conv (+ activation) layer in the CNN.
-
-The following source code defines a convolutional neural network architecture called LeNet. LeNet is a popular network known to work well on digit classification tasks. We will use a slightly different version from the original LeNet implementation, replacing the sigmoid activations with tanh activations for the neurons.
+```python
+def get_mislabeled(loader):
+    """Return list of ``(input, pred_lbl, true_lbl)`` for mislabeled samples."""
+    mislabeled = []
+    for inputs, labels in loader:
+        inputs = inputs.as_in_context(ctx)
+        labels = labels.as_in_context(ctx)
+        outputs = net(inputs)
+        # Predicted label is the index where the output is maximal
+        preds = nd.argmax(outputs, axis=1)
+        for i, p, l in zip(inputs, preds, labels):
+            p, l = int(p.asscalar()), int(l.asscalar())
+            if p != l:
+                mislabeled.append((i.asnumpy(), p, l))
+    return mislabeled
+```
 
-A typical way to write your network is creating a new class inherited from `gluon.Block`
-class. We can define the network by composing and inheriting Block class as follows:
+We can now get the mislabeled images in the training and validation sets and plot a selection of them:
 
 ```python
-import mxnet.ndarray as F
-
-class Net(gluon.Block):
-    def __init__(self, **kwargs):
-        super(Net, self).__init__(**kwargs)
-        with self.name_scope():
-            # layers created in name_scope will inherit name space
-            # from parent layer.
-            self.conv1 = nn.Conv2D(20, kernel_size=(5,5))
-            self.pool1 = nn.MaxPool2D(pool_size=(2,2), strides = (2,2))
-            self.conv2 = nn.Conv2D(50, kernel_size=(5,5))
-            self.pool2 = nn.MaxPool2D(pool_size=(2,2), strides = (2,2))
-            self.fc1 = nn.Dense(500)
-            self.fc2 = nn.Dense(10)
-
-    def forward(self, x):
-        x = self.pool1(F.tanh(self.conv1(x)))
-        x = self.pool2(F.tanh(self.conv2(x)))
-        # 0 means copy over size from corresponding dimension.
-        # -1 means infer size from the rest of dimensions.
-        x = x.reshape((0, -1))
-        x = F.tanh(self.fc1(x))
-        x = F.tanh(self.fc2(x))
-        return x
+import numpy as np
+
+sample_size = 8
+wrong_train = get_mislabeled(train_loader)
+wrong_val = get_mislabeled(val_loader)
+wrong_train_sample = [wrong_train[i] for i in np.random.randint(0, len(wrong_train), size=sample_size)]
+wrong_val_sample = [wrong_val[i] for i in np.random.randint(0, len(wrong_val), size=sample_size)]
+
+import matplotlib.pyplot as plt
+
+fig, axs = plt.subplots(ncols=sample_size)
+for ax, (img, pred, lbl) in zip(axs, wrong_train_sample):
+    fig.set_size_inches(18, 4)
+    fig.suptitle("Sample of wrong predictions in the training set", fontsize=20)
+    ax.imshow(img[0], cmap="gray")
+    ax.set_title("Predicted: {}\nActual: {}".format(pred, lbl))
+    ax.xaxis.set_visible(False)
+    ax.yaxis.set_visible(False)
+
+fig, axs = plt.subplots(ncols=sample_size)
+for ax, (img, pred, lbl) in zip(axs, wrong_val_sample):
+    fig.set_size_inches(18, 4)
+    fig.suptitle("Sample of wrong predictions in the validation set", fontsize=20)
+    ax.imshow(img[0], cmap="gray")
+    ax.set_title("Predicted: {}\nActual: {}".format(pred, lbl))
+    ax.xaxis.set_visible(False)
+    ax.yaxis.set_visible(False)
 ```
+![png](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/doc/tutorials/gluon/mnist_wrong_preds_train.png) <!--notebook-skip-line-->
+![png](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/doc/tutorials/gluon/mnist_wrong_preds_val.png) <!--notebook-skip-line-->
+
+In this case, it is rather obvious that our MLP network is either too simple or has not been trained long enough to perform really well on this dataset, as can be seen from the fact that some of the mislabeled examples are rather "easy" and should not be a challenge for our neural net.
+As it turns out, moving to the CNN architecture presented in the following section will give a big performance boost.
+
+### Convolutional Neural Network (CNN)
 
-We just defined the forward function here, and the backward function to compute gradients
-is automatically defined for you using autograd.
-We also imported `mxnet.ndarray` package to use activation functions from `ndarray` API.
+A fundamental issue with the MLP network is that it requires the inputs to be flattened (in the non-batch axes) before they can be processed by the dense layers.
+This means in particular that the spatial structure of an image is largely discarded, and that the values describing it are just treated as a long vector.
+The network then has to figure out the neighborhood relations of pixels from scratch by adjusting its weights accordingly, which seems very wasteful.
 
-Now, We will create the network as follows:
+A CNN aims to address this problem by using a more structured weight representation.
+Instead of connecting all inputs to all outputs, the characteristic [convolution layer](https://mxnet.incubator.apache.org/api/python/gluon/nn.html#mxnet.gluon.nn.Conv2D) only considers a small neighborhood of a pixel to compute the value of the corresponding output pixel.
+In particular, the spatial structure of the image is preserved, i.e., one can speak of input and output pixels in the first place.
+Only the size of the image may change through convolutions.
+[This article](http://deeplearning.net/software/theano/tutorial/conv_arithmetic.html) gives a good and intuitive explanation of convolutions in the context of deep learning.
+
+The size of the neighborhood that a convolution layer considers for each pixel is usually referred to as *filter size* or *kernel size*.
+The array of weights -- which does not depend on the output pixel location, only on the position within such a neighborhood -- is called *filter* or *kernel*.
+Typical filter sizes range from *3 x 3* to *13 x 13*, which implies that a convolution layer has *far* fewer parameters than a dense layer.
 
 ```python
-net = Net()
+conv_layer = nn.Conv2D(kernel_size=(3, 3), channels=32, in_channels=16, activation='relu')
+print(conv_layer.params)
 ```
 
-![png](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/image/conv_mnist.png)
-
-**Figure 3:** First conv + pooling layer in LeNet.
+`Parameter conv0_weight (shape=(32, 16, 3, 3), dtype=<class 'numpy.float32'>)` <!--notebook-skip-line-->
 
-Now we train LeNet with similar hyper-parameters as before. Note that, if a GPU is available, we recommend using it. This greatly speeds up computation given that LeNet is more complex and compute-intensive than the previous multilayer perceptron. To do so, we only need to change `mx.cpu()` to `mx.gpu()` and MXNet takes care of the rest. Just like before, we'll stop training after 10 epochs.
+`Parameter conv0_bias (shape=(32,), dtype=<class 'numpy.float32'>)` <!--notebook-skip-line-->
 
-Training and prediction can be done in the similar way as we did for MLP.
+Filters can be thought of as little feature detectors: in early layers, they learn to detect small local structures like edges, whereas later layers become sensitive to more and more global structures.
+Since images often contain a rich set of such features, it is customary to have each convolution layer employ and learn many different filters in parallel, so as to detect many different image features on their respective scales.
+This stacking of filters, which directly translates to a stacking of output images, is referred to as output *channels* of the convolution layer.
+Likewise, the input can already have multiple channels.
+In the above example, the convolution layer takes an input image with 16 channels and maps it to an image with 32 channels by convolving each of the input channels with a different set of 32 filters and then summing over the 16 input channels.
+Therefore, the total number of filter parameters in the convolution layer is `channels * in_channels * prod(kernel_size)`, which amounts to 4608 in the above example.
 
-#### Initialize parameters and optimizer
+Another characteristic feature of CNNs is the usage of *pooling*, i.e., summarizing patches to a single number, to shrink the size of an image as it travels through the layers.
+This step lowers the computational burden of training the network, but the main motivation for pooling is the assumption that it makes the network less sensitive to small translations, rotations or deformations of the image.
+Popular pooling strategies are max-pooling and average-pooling, and they are usually performed after convolution.
 
-We will initialize the network parameters as follows:
+The following code defines a CNN architecture called *LeNet*.
+The LeNet architecture is a popular network known to work well on digit classification tasks.
+We will use a version that differs slightly from the original in the usage of `tanh` activations instead of `sigmoid`.
 
 ```python
-# set the context on GPU is available otherwise CPU
-ctx = [mx.gpu() if mx.test_utils.list_gpus() else mx.cpu()]
-net.initialize(mx.init.Xavier(magnitude=2.24), ctx=ctx)
-trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.03})
+lenet = nn.HybridSequential(prefix='LeNet_')
+with lenet.name_scope():
+    lenet.add(
+        nn.Conv2D(channels=20, kernel_size=(5, 5), activation='tanh'),
+        nn.MaxPool2D(pool_size=(2, 2), strides=(2, 2)),
+        nn.Conv2D(channels=50, kernel_size=(5, 5), activation='tanh'),
+        nn.MaxPool2D(pool_size=(2, 2), strides=(2, 2)),
+        nn.Flatten(),
+        nn.Dense(500, activation='tanh'),
+        nn.Dense(10, activation=None),
+    )
 ```
 
-#### Training
+To get an overview of all intermediate sizes of arrays and the number of parameters in each layer, the `summary()` method can be a great help.
+It requires the network parameters to be initialized, and an input array to infer the sizes.
 
 ```python
-# Use Accuracy as the evaluation metric.
-metric = mx.metric.Accuracy()
-softmax_cross_entropy_loss = gluon.loss.SoftmaxCrossEntropyLoss()
-
-for i in range(epoch):
-    # Reset the train data iterator.
-    train_data.reset()
-    # Loop over the train data iterator.
-    for batch in train_data:
-        # Splits train data into multiple slices along batch_axis
-        # and copy each slice into a context.
-        data = gluon.utils.split_and_load(batch.data[0], ctx_list=ctx, batch_axis=0)
-        # Splits train labels into multiple slices along batch_axis
-        # and copy each slice into a context.
-        label = gluon.utils.split_and_load(batch.label[0], ctx_list=ctx, batch_axis=0)
-        outputs = []
-        # Inside training scope
-        with ag.record():
-            for x, y in zip(data, label):
-                z = net(x)
-                # Computes softmax cross entropy loss.
-                loss = softmax_cross_entropy_loss(z, y)
-                # Backpropogate the error for one iteration.
-                loss.backward()
-                outputs.append(z)
-        # Updates internal evaluation
-        metric.update(label, outputs)
-        # Make one step of parameter update. Trainer needs to know the
-        # batch size of data to normalize the gradient by 1/batch_size.
-        trainer.step(batch.data[0].shape[0])
-    # Gets the evaluation result.
-    name, acc = metric.get()
-    # Reset evaluation result to initial state.
-    metric.reset()
-    print('training acc at epoch %d: %s=%f'%(i, name, acc))
+lenet.initialize(mx.init.Xavier(), ctx=ctx)
+lenet.summary(nd.zeros((1, 1, 28, 28), ctx=ctx))
+```
+
+```
+Output:
+
+--------------------------------------------------------------------------------
+        Layer (type)                                Output Shape         Param #
+================================================================================
+               Input                              (1, 1, 28, 28)               0
+        Activation-1                <Symbol eNet_conv0_tanh_fwd>               0
+        Activation-2                             (1, 20, 24, 24)               0
+            Conv2D-3                             (1, 20, 24, 24)             520
+         MaxPool2D-4                             (1, 20, 12, 12)               0
+        Activation-5                <Symbol eNet_conv1_tanh_fwd>               0
+        Activation-6                               (1, 50, 8, 8)               0
+            Conv2D-7                               (1, 50, 8, 8)           25050
+         MaxPool2D-8                               (1, 50, 4, 4)               0
+           Flatten-9                                    (1, 800)               0
+       Activation-10               <Symbol eNet_dense0_tanh_fwd>               0
+       Activation-11                                    (1, 500)               0
+            Dense-12                                    (1, 500)          400500
+            Dense-13                                     (1, 10)            5010
+================================================================================
+Parameters in forward computation graph, duplicate included
+   Total params: 431080
+   Trainable params: 431080
+   Non-trainable params: 0
+Shared params in forward computation graph: 0
+Unique parameters in model: 431080
+--------------------------------------------------------------------------------
 ```
 
-#### Prediction
+![png](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/image/conv_mnist.png)
+
+**Figure 3:** First conv + pooling layer in LeNet.
 
-Finally, we'll use the trained LeNet model to generate predictions for the test data.
+Now we train LeNet with similar hyperparameters and procedure as before.
+Note that it is advisable to use a GPU if possible, since this model is significantly more computationally demanding to evaluate and train than the previous MLP.
 
 ```python
-# Use Accuracy as the evaluation metric.
+trainer = gluon.Trainer(
+    params=lenet.collect_params(),
+    optimizer='sgd',
+    optimizer_params={'learning_rate': 0.04},
+)
 metric = mx.metric.Accuracy()
-# Reset the validation data iterator.
-val_data.reset()
-# Loop over the validation data iterator.
-for batch in val_data:
-    # Splits validation data into multiple slices along batch_axis
-    # and copy each slice into a context.
-    data = gluon.utils.split_and_load(batch.data[0], ctx_list=ctx, batch_axis=0)
-    # Splits validation label into multiple slices along batch_axis
-    # and copy each slice into a context.
-    label = gluon.utils.split_and_load(batch.label[0], ctx_list=ctx, batch_axis=0)
-    outputs = []
-    for x in data:
-        outputs.append(net(x))
-    # Updates internal evaluation
-    metric.update(label, outputs)
-print('validation acc: %s=%f'%metric.get())
-assert metric.get()[1] > 0.98
+num_epochs = 10
+
+for epoch in range(num_epochs):
+    for inputs, labels in train_loader:
+        inputs = inputs.as_in_context(ctx)
+        labels = labels.as_in_context(ctx)
+
+        with autograd.record():
+            outputs = lenet(inputs)
+            loss = loss_function(outputs, labels)
+
+        loss.backward()
+        metric.update(labels, outputs)
+
+        trainer.step(batch_size=inputs.shape[0])
+
+    name, acc = metric.get()
+    print('After epoch {}: {} = {}'.format(epoch + 1, name, acc))
+    metric.reset()
+
+for inputs, labels in val_loader:
+    inputs = inputs.as_in_context(ctx)
+    labels = labels.as_in_context(ctx)
+    metric.update(labels, lenet(inputs))
+print('Validation: {} = {}'.format(*metric.get()))
+assert metric.get()[1] > 0.985
 ```
 
-If all went well, we should see a higher accuracy metric for predictions made using LeNet. With CNN we should be able to correctly predict around 98% of all test images.
+If all went well, we should see a higher accuracy metric for predictions made using LeNet.
+With this CNN we should be able to correctly predict around 99% of all validation images.
 
 ## Summary
 
-In this tutorial, we have learned how to use MXNet to solve a standard computer vision problem: classifying images of hand written digits. You have seen how to quickly and easily build, train and evaluate models such as MLP and CNN with MXNet Gluon package.
+In this tutorial, we demonstrated how to use MXNet to solve a standard computer vision problem: classifying images of hand-written digits.
+We showed how to quickly build, train and evaluate models such as MLPs and CNNs with the MXNet Gluon package.
 
 <!-- INSERT SOURCE DOWNLOAD BUTTONS -->

From e0ff3c36ee171386fef01fb86c54c343e4b04c14 Mon Sep 17 00:00:00 2001
From: Chaitanya Prakash Bapat <chai.bapat@gmail.com>
Date: Wed, 5 Dec 2018 21:58:19 -0800
Subject: [PATCH 38/54] Updated docs for randint operator (#13541)

* updated docs for randint

* added randint in __all__ and reordered acc to categorical then alphabetical

* Trigger CI

* minus mxnet.symbol and alphabetical for ndarray,symbol.md

* alphabetical order
---
 docs/api/python/ndarray/ndarray.md | 20 +++++++++++---------
 docs/api/python/ndarray/random.md  |  6 ++++--
 docs/api/python/symbol/random.md   |  5 +++--
 docs/api/python/symbol/symbol.md   | 19 ++++++++++---------
 python/mxnet/ndarray/random.py     |  2 +-
 python/mxnet/symbol/random.py      |  2 +-
 6 files changed, 30 insertions(+), 24 deletions(-)

diff --git a/docs/api/python/ndarray/ndarray.md b/docs/api/python/ndarray/ndarray.md
index 6fcf1d428d2b..6419c4ed4067 100644
--- a/docs/api/python/ndarray/ndarray.md
+++ b/docs/api/python/ndarray/ndarray.md
@@ -587,15 +587,17 @@ The `ndarray` package provides several classes:
 .. autosummary::
     :nosignatures:
 
-    mxnet.ndarray.random.uniform
-    mxnet.ndarray.random.normal
-    mxnet.ndarray.random.gamma
-    mxnet.ndarray.random.exponential
-    mxnet.ndarray.random.poisson
-    mxnet.ndarray.random.negative_binomial
-    mxnet.ndarray.random.generalized_negative_binomial
-    mxnet.ndarray.random.multinomial
-    mxnet.ndarray.random.shuffle
+    random.exponential
+    random.gamma
+    random.generalized_negative_binomial
+    random.multinomial
+    random.negative_binomial
+    random.normal
+    random.poisson
+    random.randint
+    random.randn
+    random.shuffle
+    random.uniform
     mxnet.random.seed
 ```
 
diff --git a/docs/api/python/ndarray/random.md b/docs/api/python/ndarray/random.md
index 3ea611f5c8e4..60c565dd5528 100644
--- a/docs/api/python/ndarray/random.md
+++ b/docs/api/python/ndarray/random.md
@@ -31,12 +31,14 @@ In the rest of this document, we list routines provided by the `ndarray.random`
     exponential
     gamma
     generalized_negative_binomial
+    multinomial
     negative_binomial
     normal
     poisson
-    uniform
-    multinomial
+    randint
+    randn
     shuffle
+    uniform
     mxnet.random.seed
 ```
 
diff --git a/docs/api/python/symbol/random.md b/docs/api/python/symbol/random.md
index b93f641334f8..1ecaf38830fc 100644
--- a/docs/api/python/symbol/random.md
+++ b/docs/api/python/symbol/random.md
@@ -31,12 +31,13 @@ In the rest of this document, we list routines provided by the `symbol.random` p
     exponential
     gamma
     generalized_negative_binomial
+    multinomial
     negative_binomial
     normal
     poisson
-    uniform
-    multinomial
+    randint
     shuffle
+    uniform
     mxnet.random.seed
 ```
 
diff --git a/docs/api/python/symbol/symbol.md b/docs/api/python/symbol/symbol.md
index a4038d741741..9eba2618065b 100644
--- a/docs/api/python/symbol/symbol.md
+++ b/docs/api/python/symbol/symbol.md
@@ -586,15 +586,16 @@ Composite multiple symbols into a new one by an operator.
 .. autosummary::
     :nosignatures:
 
-    mxnet.symbol.random.uniform
-    mxnet.symbol.random.normal
-    mxnet.symbol.random.gamma
-    mxnet.symbol.random.exponential
-    mxnet.symbol.random.poisson
-    mxnet.symbol.random.negative_binomial
-    mxnet.symbol.random.generalized_negative_binomial
-    mxnet.symbol.random.multinomial
-    mxnet.symbol.random.shuffle
+    random.exponential
+    random.gamma
+    random.generalized_negative_binomial
+    random.multinomial
+    random.negative_binomial
+    random.normal
+    random.poisson
+    random.randint
+    random.shuffle
+    random.uniform
     mxnet.random.seed
 ```
 
diff --git a/python/mxnet/ndarray/random.py b/python/mxnet/ndarray/random.py
index fc8be571e2e3..78339a020862 100644
--- a/python/mxnet/ndarray/random.py
+++ b/python/mxnet/ndarray/random.py
@@ -25,7 +25,7 @@
 
 __all__ = ['uniform', 'normal', 'randn', 'poisson', 'exponential', 'gamma',
            'multinomial', 'negative_binomial', 'generalized_negative_binomial',
-           'shuffle']
+           'shuffle', 'randint']
 
 
 def _random_helper(random, sampler, params, shape, dtype, ctx, out, kwargs):
diff --git a/python/mxnet/symbol/random.py b/python/mxnet/symbol/random.py
index c5940ac96a50..34663cddf02c 100644
--- a/python/mxnet/symbol/random.py
+++ b/python/mxnet/symbol/random.py
@@ -23,7 +23,7 @@
 
 
 __all__ = ['uniform', 'normal', 'poisson', 'exponential', 'gamma', 'multinomial',
-           'negative_binomial', 'generalized_negative_binomial', 'shuffle']
+           'negative_binomial', 'generalized_negative_binomial', 'shuffle', 'randint']
 
 
 def _random_helper(random, sampler, params, shape, dtype, kwargs):

From cf6e8cbd035bf315b3e8280416468a629c780d03 Mon Sep 17 00:00:00 2001
From: Chaitanya Prakash Bapat <chai.bapat@gmail.com>
Date: Wed, 5 Dec 2018 23:20:08 -0800
Subject: [PATCH 39/54] Chi_square_check for discrete distribution fix (#13543)

* check for bucket instead of index

* enumerate instead of range(len())

* count instead of sum to solve attribute error

* revert to sum

* separate discrete and continuous

* Trigger CI
---
 python/mxnet/test_utils.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/python/mxnet/test_utils.py b/python/mxnet/test_utils.py
index 14875601cd25..26f7762ca9b5 100644
--- a/python/mxnet/test_utils.py
+++ b/python/mxnet/test_utils.py
@@ -1911,12 +1911,15 @@ def chi_square_check(generator, buckets, probs, nsamples=1000000):
     if continuous_dist:
         sample_bucket_ids = np.searchsorted(buckets_npy, samples, side='right')
     else:
-        sample_bucket_ids = samples
+        sample_bucket_ids = np.array(samples)
     if continuous_dist:
         sample_bucket_ids = sample_bucket_ids // 2
     obs_freq = np.zeros(shape=len(buckets), dtype=np.int)
-    for i in range(len(buckets)):
-        obs_freq[i] = (sample_bucket_ids == i).sum()
+    for i, _ in enumerate(buckets):
+        if continuous_dist:
+            obs_freq[i] = (sample_bucket_ids == i).sum()
+        else:
+            obs_freq[i] = (sample_bucket_ids == buckets[i]).sum()
     _, p = ss.chisquare(f_obs=obs_freq, f_exp=expected_freq)
     return p, obs_freq, expected_freq
 

From 29885c56700971c1557f7611acb7002d87da5f2b Mon Sep 17 00:00:00 2001
From: Tao Lv <tao.a.lv@intel.com>
Date: Thu, 6 Dec 2018 17:44:06 +0800
Subject: [PATCH 40/54] Revert "Bumped minor version from 1.4.0 to 1.5.0 on
 master, updated License file" (#13558)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Revert "Chi_square_check for discrete distribution fix (#13543)"

This reverts commit cf6e8cbd035bf315b3e8280416468a629c780d03.

* Revert "Updated docs for randint operator (#13541)"

This reverts commit e0ff3c36ee171386fef01fb86c54c343e4b04c14.

* Revert "Simplifications and some fun stuff for the MNIST Gluon tutorial (#13094)"

This reverts commit 8bbac827742c21607a863137792f03bd09847419.

* Revert "Fix #13521 (#13537)"

This reverts commit f6b4665995f8f8ff32862a029b2074475d8467eb.

* Revert "Add a retry to qemu_provision (#13551)"

This reverts commit f6f840110d74111f98c20eab5b08d64a46ebf0cd.

* Revert "[MXNET-769] Use MXNET_HOME in a tempdir in windows to prevent access denied due t… (#13531)"

This reverts commit bd8e0f8356676749ecae16ec38a366b4cc00bf15.

* Revert "[MXNET-1249] Fix Object Detector Performance with GPU (#13522)"

This reverts commit 1c8972c3c8f832519364916865541f48597581c7.

* Revert "Fixing a 404 in the ubuntu setup doc (#13542)"

This reverts commit cb0db290adcfd0fce956d02c234f81d453e41013.

* Revert "Bumped minor version from 1.4.0 to 1.5.0 on master, updated License file (#13478)"

This reverts commit 40db61908000ee86d21aac847ff2225807d6c168.
---
 CMakeLists.txt                                |  1 -
 LICENSE                                       | 94 ++-----------------
 Makefile                                      |  9 +-
 R-package/DESCRIPTION                         | 10 +-
 ci/docker/runtime_functions.sh                |  3 +
 ci/jenkins/Jenkins_steps.groovy               |  8 +-
 contrib/clojure-package/README.md             | 16 ++--
 .../cnn-text-classification/project.clj       |  2 +-
 .../clojure-package/examples/gan/project.clj  |  2 +-
 .../examples/imclassification/project.clj     |  2 +-
 .../examples/module/project.clj               |  2 +-
 .../examples/multi-label/project.clj          |  2 +-
 .../examples/neural-style/project.clj         |  2 +-
 .../examples/pre-trained-models/project.clj   |  2 +-
 .../examples/profiler/project.clj             |  2 +-
 .../clojure-package/examples/rnn/project.clj  |  2 +-
 .../examples/tutorial/project.clj             |  6 +-
 .../examples/visualization/project.clj        |  2 +-
 contrib/clojure-package/project.clj           |  4 +-
 docs/api/python/symbol/contrib.md             |  3 -
 .../scala/mxnet_scala_on_intellij.md          |  4 +-
 include/mxnet/base.h                          |  2 +-
 mkldnn.mk                                     | 12 +--
 python/mxnet/libinfo.py                       |  2 +-
 .../assembly/linux-x86_64-cpu/pom.xml         |  8 +-
 .../assembly/linux-x86_64-gpu/pom.xml         |  8 +-
 scala-package/assembly/osx-x86_64-cpu/pom.xml |  8 +-
 scala-package/assembly/pom.xml                |  2 +-
 scala-package/core/pom.xml                    |  6 +-
 scala-package/examples/pom.xml                |  6 +-
 scala-package/infer/pom.xml                   |  4 +-
 .../init-native/linux-x86_64/pom.xml          |  4 +-
 scala-package/init-native/osx-x86_64/pom.xml  |  4 +-
 scala-package/init-native/pom.xml             |  2 +-
 scala-package/init/pom.xml                    |  2 +-
 scala-package/macros/pom.xml                  |  6 +-
 scala-package/native/linux-x86_64-cpu/pom.xml |  4 +-
 scala-package/native/linux-x86_64-gpu/pom.xml |  4 +-
 scala-package/native/osx-x86_64-cpu/pom.xml   |  4 +-
 scala-package/native/pom.xml                  |  2 +-
 scala-package/pom.xml                         |  2 +-
 scala-package/spark/pom.xml                   |  4 +-
 snapcraft.yaml                                |  2 +-
 tests/cpp/unittest.mk                         |  8 +-
 .../train_mxnet_legacy_models.sh              |  4 +-
 tests/python/mkl/test_mkldnn.py               |  6 +-
 tests/python/mkl/test_mkldnn_install.py       | 56 +++++++++++
 47 files changed, 158 insertions(+), 192 deletions(-)
 create mode 100644 tests/python/mkl/test_mkldnn_install.py

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 161705643194..3b8bbd2e0272 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -227,7 +227,6 @@ if(USE_MKLDNN)
   include(cmake/DownloadMKLML.cmake)
   # CPU architecture (e.g., C5) can't run on another architecture (e.g., g3).
   if(NOT MSVC)
-    set(MKLDNN_LIBRARY_TYPE "STATIC" CACHE INTERNAL "" FORCE)
     set(ARCH_OPT_FLAGS "-mtune=generic")
   else()
     set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /EHsc")
diff --git a/LICENSE b/LICENSE
index 2eb9c329e532..a8b57e583764 100644
--- a/LICENSE
+++ b/LICENSE
@@ -218,20 +218,16 @@
     1. MXNet Cpp-package - For details, /cpp-package/LICENSE
     2. MXNet rcnn - For details, see, example/rcnn/LICENSE
     3. scala-package - For details, see, scala-package/LICENSE
-    4. Warp-CTC - For details, see, 3rdparty/ctc_include/LICENSE
+    4. Warp-CTC - For details, see, src/operator/contrib/ctc_include/LICENSE
     5. 3rdparty/dlpack - For details, see, 3rdparty/dlpack/LICENSE
     6. 3rdparty/dmlc-core - For details, see, 3rdparty/dmlc-core/LICENSE
     7. 3rdparty/mshadow - For details, see, 3rdparty/mshadow/LICENSE
     8. 3rdparty/tvm - For details, see, 3rdparty/tvm/LICENSE
     9. 3rdparty/tvm/dmlc-core - For details, see, 3rdparty/tvm/dmlc-core/LICENSE
-    10. 3rdparty/tvm/dlpack - For details, see, 3rdparty/tvm/3rdparty/dlpack/LICENSE
-    11. 3rdparty/tvm/nnvm - For details, see, 3rdparty/tvm/nnvm/LICENSE
-    12. 3rdparty/ps-lite - For details, see, 3rdparty/ps-lite/LICENSE
-    13. 3rdparty/mkldnn - For details, see, 3rdparty/mkldnn/LICENSE
-    14. googlemock scripts/generator - For details, see, 3rdparty/googletest/googlemock/scripts/generator/LICENSE
-    15. clojure-package - For details, see, contrib/clojure-package/LICENSE
-    16. R-package - For details, see, R-package/LICENSE
-    17. ONNX-TensorRT benchmark package - For details, see, 3rdparty/onnx-tensorrt/third_party/onnx/third_party/benchmark/LICENSE
+    10. 3rdparty/tvm/nnvm - For details, see, 3rdparty/tvm/nnvm/LICENSE
+    11. 3rdparty/ps-lite - For details, see, 3rdparty/ps-lite/LICENSE
+    12. 3rdparty/mkldnn - For details, see, 3rdparty/mkldnn/LICENSE
+    13. googlemock scripts/generator - For details, see, 3rdparty/googletest/googlemock/scripts/generator/LICENSE
 
 
     =======================================================================================
@@ -243,9 +239,6 @@
     3. tree_lstm - For details, see example/gluon/tree_lstm/LICENSE
     4. OpenMP - For details, see 3rdparty/openmp/LICENSE.txt
     5. HalideIR - For details, see nnvm/tvm/HalideIR/LICENSE
-    6. HalideIR - For details, see 3rdparty/tvm/3rdparty/HalideIR/LICENSE
-    7. ONNX-TensorRT - For details, see 3rdparty/onnx-tensorrt/LICENSE
-    8. ONNX-TensorRT - For details, see 3rdparty/onnx-tensorrt/third_party/onnx/LICENSE
 
 
     =======================================================================================
@@ -253,7 +246,7 @@
     =======================================================================================
 
     1. Moderngpu
-    For details, see, 3rdparty/ctc_include/contrib/moderngpu/LICENSE
+    For details, see, src/operator/contrib/ctc_include/contrib/moderngpu/LICENSE
 
     /******************************************************************************
     * Redistribution and use in source and binary forms, with or without
@@ -566,79 +559,4 @@
     #  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     #  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-    =======================================================================================
-
-    12. Google tests
-        For details, ses, 3rdparty/mkldnn/tests/gtests/gtest/LICENSE
-
-    Copyright 2008, Google Inc.
-    All rights reserved.
-
-    Redistribution and use in source and binary forms, with or without
-    modification, are permitted provided that the following conditions are
-    met:
-
-        * Redistributions of source code must retain the above copyright
-    notice, this list of conditions and the following disclaimer.
-        * Redistributions in binary form must reproduce the above
-    copyright notice, this list of conditions and the following disclaimer
-    in the documentation and/or other materials provided with the
-    distribution.
-        * Neither the name of Google Inc. nor the names of its
-    contributors may be used to endorse or promote products derived from
-    this software without specific prior written permission.
-
-    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-    OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-    =======================================================================================
-
-    13. ONNX python bindings
-    For details, see, 3rdparty/onnx-tensorrt/third_party/onnx/third_party/pybind11/LICENSE
-
-    Copyright (c) 2016 Wenzel Jakob <wenzel.jakob@epfl.ch>, All rights reserved.
-
-    Redistribution and use in source and binary forms, with or without
-    modification, are permitted provided that the following conditions are met:
-
-    1. Redistributions of source code must retain the above copyright notice, this
-       list of conditions and the following disclaimer.
-
-    2. Redistributions in binary form must reproduce the above copyright notice,
-       this list of conditions and the following disclaimer in the documentation
-       and/or other materials provided with the distribution.
-
-    3. Neither the name of the copyright holder nor the names of its contributors
-       may be used to endorse or promote products derived from this software
-       without specific prior written permission.
-
-    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-    ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-    WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-    DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-    FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-    DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-    SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-    CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-    OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-    You are under no obligation whatsoever to provide any bug fixes, patches, or
-    upgrades to the features, functionality or performance of the source code
-    ("Enhancements") to anyone; however, if you choose to make your Enhancements
-    available either publicly, or directly to the author of this software, without
-    imposing a separate written license agreement for such Enhancements, then you
-    hereby grant the following license: a non-exclusive, royalty-free perpetual
-    license to install, use, modify, prepare derivative works, incorporate into
-    other computer software, distribute, and sublicense such enhancements or
-    derivative works thereof, in binary and source code form.
 
diff --git a/Makefile b/Makefile
index e424904ad785..16ea59f3d585 100644
--- a/Makefile
+++ b/Makefile
@@ -131,13 +131,8 @@ ifeq ($(USE_MKLDNN), 1)
 		CFLAGS += -I$(MKLROOT)/include
 		LDFLAGS += -L$(MKLROOT)/lib
 	endif
-	# MKLDNN but to needs to be dynamically linked for windows as not all VS compilers support static linking
-	ifneq ($(UNAME_S), Windows)
-		LIB_DEP += $(MKLDNNROOT)/lib/libmkldnn.a
-	else
-		CFLAGS += -I$(MKLDNNROOT)/include
-		LDFLAGS += -L$(MKLDNNROOT)/lib -lmkldnn -Wl,-rpath,'$${ORIGIN}'
-	endif
+	CFLAGS += -I$(MKLDNNROOT)/include
+	LDFLAGS += -L$(MKLDNNROOT)/lib -lmkldnn -Wl,-rpath,'$${ORIGIN}'
 endif
 
 # setup opencv
diff --git a/R-package/DESCRIPTION b/R-package/DESCRIPTION
index da098996c68b..46702eff9ed7 100644
--- a/R-package/DESCRIPTION
+++ b/R-package/DESCRIPTION
@@ -1,17 +1,17 @@
 Package: mxnet
 Type: Package
 Title: MXNet: A Flexible and Efficient Machine Learning Library for Heterogeneous Distributed Systems
-Version: 1.5.0
+Version: 1.4.0
 Date: 2017-06-27
 Author: Tianqi Chen, Qiang Kou, Tong He
-Maintainer: Qiang Kou <qkou@qkou.info>, anirudhacharya <https://github.com/anirudhacharya>
-Repository: Apache
+Maintainer: Qiang Kou <qkou@qkou.info>
+Repository: DMLC
 Description: MXNet is a deep learning framework designed for both efficiency
     and flexibility. It allows you to mix the flavours of deep learning programs
     together to maximize the efficiency and your productivity.
 License: Apache License (== 2.0)
-URL: https://github.com/apache/incubator-mxnet/tree/master/R-package
-BugReports: https://github.com/apache/incubator-mxnet/issues
+URL: https://github.com/dmlc/mxnet/tree/master/R-package
+BugReports: https://github.com/dmlc/mxnet/issues
 Imports:
     methods,
     Rcpp (>= 0.12.1),
diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh
index 5a44cccc6aa0..1fc10bf0e085 100755
--- a/ci/docker/runtime_functions.sh
+++ b/ci/docker/runtime_functions.sh
@@ -629,6 +629,9 @@ build_ubuntu_gpu_cmake_mkldnn() {
         /work/mxnet
 
     ninja -v
+    # libmkldnn.so.0 is a symlink. Replace it with a regular-file copy of the library it points to, still named libmkldnn.so.0.
+    cp 3rdparty/mkldnn/src/libmkldnn.so.0 3rdparty/mkldnn/src/libmkldnn.so.0.tmp
+    mv 3rdparty/mkldnn/src/libmkldnn.so.0.tmp 3rdparty/mkldnn/src/libmkldnn.so.0
 }
 
 build_ubuntu_gpu_cmake() {
diff --git a/ci/jenkins/Jenkins_steps.groovy b/ci/jenkins/Jenkins_steps.groovy
index 309775c88c85..f48a26737308 100644
--- a/ci/jenkins/Jenkins_steps.groovy
+++ b/ci/jenkins/Jenkins_steps.groovy
@@ -23,19 +23,19 @@
 utils = load('ci/Jenkinsfile_utils.groovy')
 
 // mxnet libraries
-mx_lib = 'lib/libmxnet.so, lib/libmxnet.a, lib/libiomp5.so, lib/libmklml_intel.so, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a'
+mx_lib = 'lib/libmxnet.so, lib/libmxnet.a, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a'
 
 // Python wheels
 mx_pip = 'build/*.whl'
 
 // for scala build, need to pass extra libs when run with dist_kvstore
-mx_dist_lib = 'lib/libmxnet.so, lib/libmxnet.a, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a, 3rdparty/ps-lite/build/libps.a, deps/lib/libprotobuf-lite.a, deps/lib/libzmq.a, lib/libmkldnn.a'
+mx_dist_lib = 'lib/libmxnet.so, lib/libmxnet.a, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a, 3rdparty/ps-lite/build/libps.a, deps/lib/libprotobuf-lite.a, deps/lib/libzmq.a'
 // mxnet cmake libraries, in cmake builds we do not produce a libnvvm static library by default.
 mx_cmake_lib = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/dmlc-core/libdmlc.a, build/tests/mxnet_unit_tests, build/3rdparty/openmp/runtime/src/libomp.so'
 // mxnet cmake libraries, in cmake builds we do not produce a libnvvm static library by default.
 mx_cmake_lib_debug = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/dmlc-core/libdmlc.a, build/tests/mxnet_unit_tests'
-mx_cmake_mkldnn_lib = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/dmlc-core/libdmlc.a, build/tests/mxnet_unit_tests, build/3rdparty/openmp/runtime/src/libomp.so'
-mx_mkldnn_lib = 'lib/libmxnet.so, lib/libmxnet.a, lib/libiomp5.so, lib/libmklml_intel.so, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a'
+mx_cmake_mkldnn_lib = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/dmlc-core/libdmlc.a, build/tests/mxnet_unit_tests, build/3rdparty/openmp/runtime/src/libomp.so, build/3rdparty/mkldnn/src/libmkldnn.so.0'
+mx_mkldnn_lib = 'lib/libmxnet.so, lib/libmxnet.a, lib/libiomp5.so, lib/libmkldnn.so.0, lib/libmklml_intel.so, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a'
 mx_tensorrt_lib = 'lib/libmxnet.so, lib/libnvonnxparser_runtime.so.0, lib/libnvonnxparser.so.0, lib/libonnx_proto.so, lib/libonnx.so'
 mx_lib_cpp_examples = 'lib/libmxnet.so, lib/libmxnet.a, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a, 3rdparty/ps-lite/build/libps.a, deps/lib/libprotobuf-lite.a, deps/lib/libzmq.a, build/cpp-package/example/*'
 mx_lib_cpp_examples_cpu = 'build/libmxnet.so, build/cpp-package/example/*'
diff --git a/contrib/clojure-package/README.md b/contrib/clojure-package/README.md
index 10b3ed770582..bc6100b86123 100644
--- a/contrib/clojure-package/README.md
+++ b/contrib/clojure-package/README.md
@@ -105,9 +105,9 @@ brew install opencv
 - Create a new project with `lein new my-mxnet`
 - Edit your `project.clj` and add one of the following entries to `:dependencies`, based on your system and the compute device you want to use:
 
-  - `[org.apache.mxnet.contrib.clojure/clojure-mxnet-linux-cpu "1.5.0"]`
-  - `[org.apache.mxnet.contrib.clojure/clojure-mxnet-linux-gpu "1.5.0"]`
-  - `[org.apache.mxnet.contrib.clojure/clojure-mxnet-osx-cpu "1.5.0"]`
+  - `[org.apache.mxnet.contrib.clojure/clojure-mxnet-linux-cpu "1.4.0"]`
+  - `[org.apache.mxnet.contrib.clojure/clojure-mxnet-linux-gpu "1.4.0"]`
+  - `[org.apache.mxnet.contrib.clojure/clojure-mxnet-osx-cpu "1.4.0"]`
 
 After making this change and running `lein deps`, you should be able to run example code like this [NDArray Tutorial](https://github.com/apache/incubator-mxnet/blob/master/contrib/clojure-package/examples/tutorial/src/tutorial/ndarray.clj).
 
@@ -116,20 +116,20 @@ After making this change and running `lein deps`, you should be able to run exam
 With this option, you will install a Git revision of the Clojure package source and a [Scala package jar from Maven](https://search.maven.org/search?q=g:org.apache.mxnet) with native dependencies baked in.
 
 - Install additional dependencies as described in [the corresponding section for Option 1](#installing-additional-dependencies),
-- Recursively clone the MXNet repository and checkout the desired revision. Here we assume the `1.5.0` tag and a clone into the `~/mxnet` directory:
+- Recursively clone the MXNet repository and checkout the desired revision. Here we assume the `1.4.0` tag and a clone into the `~/mxnet` directory:
 
   ```bash
   git clone --recursive https://github.com/apache/incubator-mxnet.git ~/mxnet
   cd ~/mxnet
   git tag --list  # Find the tag that matches the Scala package version
-  git checkout tags/1.5.0 -b my_mxnet
+  git checkout tags/1.4.0 -b my_mxnet
   git submodule update --init --recursive
   cd contrib/clojure
   ```
 
 - Edit `project.clj` to include the desired Scala jar from Maven:
 
-      [org.apache.mxnet/mxnet-full_2.11-linux-x86_64-cpu "1.5.0”]
+      [org.apache.mxnet/mxnet-full_2.11-linux-x86_64-cpu "1.4.0"]
 
 - Run `lein test`. All the tests should run without error.
 - At this point you can run `lein install` to build and install the Clojure jar locally.
@@ -147,7 +147,7 @@ The first step is to recursively clone the MXNet repository and checkout the des
   ```bash
   git clone --recursive https://github.com/apache/incubator-mxnet.git ~/mxnet
   cd ~/mxnet
-  git checkout tags/1.5.0 -b my_mxnet  # this is optional
+  git checkout tags/1.4.0 -b my_mxnet  # this is optional
   git submodule update --init --recursive
   ```
 
@@ -176,7 +176,7 @@ The outcome of this step will be a shared library `lib/libmxnet.so` that is used
 
 #### Building the Clojure jar
  
-- Enter the `contrib/clojure` directory and edit the `project.clj` file. Add the Scala jar that was just created and installed, e.g., `[org.apache.mxnet/mxnet-full_2.11-osx-x86_64-cpu "1.5.0-SNAPSHOT"]`, to the `:dependencies`.
+- Enter the `contrib/clojure` directory and edit the `project.clj` file. Add the Scala jar that was just created and installed, e.g., `[org.apache.mxnet/mxnet-full_2.11-osx-x86_64-cpu "1.4.0-SNAPSHOT"]`, to the `:dependencies`.
 - Run `lein test`. All the tests should run without an error.
 - Run `lein install` to build and install the Clojure jar locally.
 
diff --git a/contrib/clojure-package/examples/cnn-text-classification/project.clj b/contrib/clojure-package/examples/cnn-text-classification/project.clj
index 29ebefe5d200..3eed0ddf9d9c 100644
--- a/contrib/clojure-package/examples/cnn-text-classification/project.clj
+++ b/contrib/clojure-package/examples/cnn-text-classification/project.clj
@@ -19,6 +19,6 @@
   :description "CNN text classification with MXNet"
   :plugins [[lein-cljfmt "0.5.7"]]
   :dependencies [[org.clojure/clojure "1.9.0"]
-                 [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.5.0-SNAPSHOT"]]
+                 [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.4.0-SNAPSHOT"]]
   :pedantic? :skip
   :main cnn-text-classification.classifier)
diff --git a/contrib/clojure-package/examples/gan/project.clj b/contrib/clojure-package/examples/gan/project.clj
index b8f6903cabba..36b7c6cb3089 100644
--- a/contrib/clojure-package/examples/gan/project.clj
+++ b/contrib/clojure-package/examples/gan/project.clj
@@ -19,6 +19,6 @@
   :description "GAN MNIST with MXNet"
   :plugins [[lein-cljfmt "0.5.7"]]
   :dependencies [[org.clojure/clojure "1.9.0"]
-                 [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.5.0-SNAPSHOT"]
+                 [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.4.0-SNAPSHOT"]
                  [nu.pattern/opencv "2.4.9-7"]]
   :main gan.gan-mnist)
diff --git a/contrib/clojure-package/examples/imclassification/project.clj b/contrib/clojure-package/examples/imclassification/project.clj
index 5f77cf55cf35..0dbede5052ac 100644
--- a/contrib/clojure-package/examples/imclassification/project.clj
+++ b/contrib/clojure-package/examples/imclassification/project.clj
@@ -19,6 +19,6 @@
   :description "Clojure examples for image classification"
   :plugins [[lein-cljfmt "0.5.7"]]
   :dependencies [[org.clojure/clojure "1.9.0"]
-                 [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.5.0-SNAPSHOT"]]
+                 [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.4.0-SNAPSHOT"]]
   :pedantic? :skip
   :main imclassification.train-mnist)
diff --git a/contrib/clojure-package/examples/module/project.clj b/contrib/clojure-package/examples/module/project.clj
index b667a2a4e122..a9a0a5f23e6e 100644
--- a/contrib/clojure-package/examples/module/project.clj
+++ b/contrib/clojure-package/examples/module/project.clj
@@ -19,7 +19,7 @@
   :description "Clojure examples for module"
   :plugins [[lein-cljfmt "0.5.7"]]
   :dependencies [[org.clojure/clojure "1.9.0"]
-                 [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.5.0-SNAPSHOT"]]
+                 [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.4.0-SNAPSHOT"]]
   :pedantic? :skip
   :main mnist-mlp)
 
diff --git a/contrib/clojure-package/examples/multi-label/project.clj b/contrib/clojure-package/examples/multi-label/project.clj
index 6e6a14340d36..8923738b946d 100644
--- a/contrib/clojure-package/examples/multi-label/project.clj
+++ b/contrib/clojure-package/examples/multi-label/project.clj
@@ -19,5 +19,5 @@
   :description "Example of multi-label classification"
   :plugins [[lein-cljfmt "0.5.7"]]
   :dependencies [[org.clojure/clojure "1.9.0"]
-                 [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.5.0-SNAPSHOT"]]
+                 [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.4.0-SNAPSHOT"]]
   :main multi-label.core)
diff --git a/contrib/clojure-package/examples/neural-style/project.clj b/contrib/clojure-package/examples/neural-style/project.clj
index b6d29f7c0e87..5a8eebea783f 100644
--- a/contrib/clojure-package/examples/neural-style/project.clj
+++ b/contrib/clojure-package/examples/neural-style/project.clj
@@ -19,7 +19,7 @@
   :description "Neural Style Transfer with MXNet"
   :plugins [[lein-cljfmt "0.5.7"]]
   :dependencies [[org.clojure/clojure "1.9.0"]
-                 [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.5.0-SNAPSHOT"]
+                 [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.4.0-SNAPSHOT"]
                  [net.mikera/imagez "0.12.0"]
                  [thinktopic/think.image "0.4.16"]]
   :main neural-style.core)
diff --git a/contrib/clojure-package/examples/pre-trained-models/project.clj b/contrib/clojure-package/examples/pre-trained-models/project.clj
index 11e002503464..58b591ce5307 100644
--- a/contrib/clojure-package/examples/pre-trained-models/project.clj
+++ b/contrib/clojure-package/examples/pre-trained-models/project.clj
@@ -19,7 +19,7 @@
   :description "Example of using pre-trained models with MXNet"
   :plugins [[lein-cljfmt "0.5.7"]]
   :dependencies [[org.clojure/clojure "1.9.0"]
-                 [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.5.0-SNAPSHOT"]
+                 [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.4.0-SNAPSHOT"]
                  [net.mikera/imagez "0.12.0"]
                  [thinktopic/think.image "0.4.16"]]
   :main pre-trained-models.fine-tune)
diff --git a/contrib/clojure-package/examples/profiler/project.clj b/contrib/clojure-package/examples/profiler/project.clj
index cc50482d0418..fa30eafa0daf 100644
--- a/contrib/clojure-package/examples/profiler/project.clj
+++ b/contrib/clojure-package/examples/profiler/project.clj
@@ -18,5 +18,5 @@
 (defproject profiler "0.1.0-SNAPSHOT"
   :plugins [[lein-cljfmt "0.5.7"]]
   :dependencies [[org.clojure/clojure "1.9.0"]
-                 [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.5.0-SNAPSHOT"]]
+                 [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.4.0-SNAPSHOT"]]
   :main profiler.core)
diff --git a/contrib/clojure-package/examples/rnn/project.clj b/contrib/clojure-package/examples/rnn/project.clj
index 64f4c290741c..291f2bd46e3a 100644
--- a/contrib/clojure-package/examples/rnn/project.clj
+++ b/contrib/clojure-package/examples/rnn/project.clj
@@ -19,5 +19,5 @@
   :description "RNN example"
   :plugins [[lein-cljfmt "0.5.7"]]
   :dependencies [[org.clojure/clojure "1.9.0"]
-                 [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.5.0-SNAPSHOT"]]
+                 [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.4.0-SNAPSHOT"]]
   :main rnn.train-char-rnn)
diff --git a/contrib/clojure-package/examples/tutorial/project.clj b/contrib/clojure-package/examples/tutorial/project.clj
index 9c4f1b96f9e0..8a78ec6a6abf 100644
--- a/contrib/clojure-package/examples/tutorial/project.clj
+++ b/contrib/clojure-package/examples/tutorial/project.clj
@@ -20,6 +20,6 @@
   :plugins [[lein-cljfmt "0.5.7"]]
   :dependencies [[org.clojure/clojure "1.9.0"]
                  ;; Uncomment the one appropriate for your machine & configuration:
-                 #_[org.apache.mxnet.contrib.clojure/clojure-mxnet-linux-cpu "1.5.0"]
-                 #_[org.apache.mxnet.contrib.clojure/clojure-mxnet-linux-gpu "1.5.0"]
-                 #_[org.apache.mxnet.contrib.clojure/clojure-mxnet-osx-cpu "1.5.0"]])
+                 #_[org.apache.mxnet.contrib.clojure/clojure-mxnet-linux-cpu "1.4.0"]
+                 #_[org.apache.mxnet.contrib.clojure/clojure-mxnet-linux-gpu "1.4.0"]
+                 #_[org.apache.mxnet.contrib.clojure/clojure-mxnet-osx-cpu "1.4.0"]])
diff --git a/contrib/clojure-package/examples/visualization/project.clj b/contrib/clojure-package/examples/visualization/project.clj
index d91ace3188e6..d56ddfb23f0c 100644
--- a/contrib/clojure-package/examples/visualization/project.clj
+++ b/contrib/clojure-package/examples/visualization/project.clj
@@ -19,5 +19,5 @@
   :description "Visualization example"
   :plugins [[lein-cljfmt "0.5.7"]]
   :dependencies [[org.clojure/clojure "1.9.0"]
-                 [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.5.0-SNAPSHOT"]]
+                 [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.4.0-SNAPSHOT"]]
   :main visualization.core)
diff --git a/contrib/clojure-package/project.clj b/contrib/clojure-package/project.clj
index 12a0504e02d5..ae7ccd67fd9c 100644
--- a/contrib/clojure-package/project.clj
+++ b/contrib/clojure-package/project.clj
@@ -15,7 +15,7 @@
 ;; limitations under the License.
 ;;
 
-(defproject org.apache.mxnet.contrib.clojure/clojure-mxnet "1.5.0-SNAPSHOT"
+(defproject org.apache.mxnet.contrib.clojure/clojure-mxnet "1.4.0-SNAPSHOT"
   :description "Clojure package for MXNet"
   :url "https://github.com/apache/incubator-mxnet"
   :license {:name "Apache License"
@@ -29,7 +29,7 @@
                  ;[org.apache.mxnet/mxnet-full_2.11-linux-x86_64-gpu "1.2.1"]
 
                  ;;; CI
-                 [org.apache.mxnet/mxnet-full_2.11-linux-x86_64-cpu "1.5.0-SNAPSHOT"]
+                 [org.apache.mxnet/mxnet-full_2.11-linux-x86_64-cpu "1.4.0-SNAPSHOT"]
 
                  [org.clojure/tools.logging "0.4.0"]
                  [org.apache.logging.log4j/log4j-core "2.8.1"]
diff --git a/docs/api/python/symbol/contrib.md b/docs/api/python/symbol/contrib.md
index 35cd11c89a70..a0253216f945 100644
--- a/docs/api/python/symbol/contrib.md
+++ b/docs/api/python/symbol/contrib.md
@@ -55,9 +55,6 @@ In the rest of this document, we list routines provided by the `symbol.contrib`
     foreach
     while_loop
     cond
-    isinf
-    isfinite
-    isnan
     index_copy
     getnnz
     edge_id
diff --git a/docs/tutorials/scala/mxnet_scala_on_intellij.md b/docs/tutorials/scala/mxnet_scala_on_intellij.md
index a0bf24e34e28..174e3018098b 100644
--- a/docs/tutorials/scala/mxnet_scala_on_intellij.md
+++ b/docs/tutorials/scala/mxnet_scala_on_intellij.md
@@ -385,14 +385,14 @@ If you chose to "Build from Source" when following the [install instructions](ht
       <groupId>org.apache.mxnet</groupId>
       <artifactId>mxnet-core_${scala.version}-${platform}-sources</artifactId>
       <scope>system</scope>
-      <systemPath>/PathToMXNetSource/incubator-mxnet/scala-package/assembly/osx-x86_64-cpu/target/mxnet-full_${scala.version}-osx-x86_64-cpu-1.5.0-SNAPSHOT-sources.jar</systemPath>
+      <systemPath>/PathToMXNetSource/incubator-mxnet/scala-package/assembly/osx-x86_64-cpu/target/mxnet-full_${scala.version}-osx-x86_64-cpu-1.4.0-SNAPSHOT-sources.jar</systemPath>
     </dependency>
 
     <dependency>
       <groupId>org.apache.mxnet</groupId>
       <artifactId>mxnet-full_${scala.version}-${platform}</artifactId>
       <scope>system</scope>
-      <systemPath>/PathToMXNetSource/incubator-mxnet/scala-package/assembly/osx-x86_64-cpu/target/mxnet-full_${scala.version}-osx-x86_64-cpu-1.5.0-SNAPSHOT.jar</systemPath>
+      <systemPath>/PathToMXNetSource/incubator-mxnet/scala-package/assembly/osx-x86_64-cpu/target/mxnet-full_${scala.version}-osx-x86_64-cpu-1.4.0-SNAPSHOT.jar</systemPath>
     </dependency>
 ```
 
diff --git a/include/mxnet/base.h b/include/mxnet/base.h
index 92d9c2699d63..f773139d6c3e 100644
--- a/include/mxnet/base.h
+++ b/include/mxnet/base.h
@@ -102,7 +102,7 @@
 /*! \brief major version */
 #define MXNET_MAJOR 1
 /*! \brief minor version */
-#define MXNET_MINOR 5
+#define MXNET_MINOR 4
 /*! \brief patch version */
 #define MXNET_PATCH 0
 /*! \brief mxnet version */
diff --git a/mkldnn.mk b/mkldnn.mk
index 5af3e9b1d741..d79bbe7d2a0e 100644
--- a/mkldnn.mk
+++ b/mkldnn.mk
@@ -19,20 +19,14 @@ ifeq ($(USE_MKLDNN), 1)
 	MKLDNN_SUBMODDIR = $(ROOTDIR)/3rdparty/mkldnn
 	MKLDNN_BUILDDIR = $(MKLDNN_SUBMODDIR)/build
 	MXNET_LIBDIR = $(ROOTDIR)/lib
-	MKLDNN_LIBRARY_TYPE=STATIC
 ifeq ($(UNAME_S), Darwin)
 	OMP_LIBFILE = $(MKLDNNROOT)/lib/libiomp5.dylib
 	MKLML_LIBFILE = $(MKLDNNROOT)/lib/libmklml.dylib
-	MKLDNN_LIBFILE = $(MKLDNNROOT)/lib/libmkldnn.a
-else ifeq ($(UNAME_S), Windows)
-	OMP_LIBFILE = $(MKLDNNROOT)/lib/libiomp5.so
-	MKLML_LIBFILE = $(MKLDNNROOT)/lib/libmklml_intel.so
-	MKLDNN_LIBFILE = $(MKLDNNROOT)/lib/libmkldnn.so
-	MKLDNN_LIBRARY_TYPE=SHARED
+	MKLDNN_LIBFILE = $(MKLDNNROOT)/lib/libmkldnn.0.dylib
 else
 	OMP_LIBFILE = $(MKLDNNROOT)/lib/libiomp5.so
 	MKLML_LIBFILE = $(MKLDNNROOT)/lib/libmklml_intel.so
-	MKLDNN_LIBFILE = $(MKLDNNROOT)/lib/libmkldnn.a
+	MKLDNN_LIBFILE = $(MKLDNNROOT)/lib/libmkldnn.so.0
 endif
 endif
 
@@ -43,7 +37,7 @@ mkldnn_build: $(MKLDNN_LIBFILE)
 $(MKLDNN_LIBFILE):
 	mkdir -p $(MKLDNNROOT)
 	cd $(MKLDNN_SUBMODDIR) && rm -rf external && cd scripts && ./prepare_mkl.sh && cd .. && cp -a external/*/* $(MKLDNNROOT)/.
-	cmake $(MKLDNN_SUBMODDIR) -DCMAKE_INSTALL_PREFIX=$(MKLDNNROOT) -B$(MKLDNN_BUILDDIR) -DARCH_OPT_FLAGS="-mtune=generic" -DWITH_TEST=OFF -DWITH_EXAMPLE=OFF -DMKLDNN_LIBRARY_TYPE=$(MKLDNN_LIBRARY_TYPE)
+	cmake $(MKLDNN_SUBMODDIR) -DCMAKE_INSTALL_PREFIX=$(MKLDNNROOT) -B$(MKLDNN_BUILDDIR) -DARCH_OPT_FLAGS="-mtune=generic" -DWITH_TEST=OFF -DWITH_EXAMPLE=OFF
 	$(MAKE) -C $(MKLDNN_BUILDDIR) VERBOSE=1
 	$(MAKE) -C $(MKLDNN_BUILDDIR) install
 	mkdir -p $(MXNET_LIBDIR)
diff --git a/python/mxnet/libinfo.py b/python/mxnet/libinfo.py
index ff795f914a4b..57c73e5943af 100644
--- a/python/mxnet/libinfo.py
+++ b/python/mxnet/libinfo.py
@@ -111,4 +111,4 @@ def find_include_path():
 
 
 # current version
-__version__ = "1.5.0"
+__version__ = "1.4.0"
diff --git a/scala-package/assembly/linux-x86_64-cpu/pom.xml b/scala-package/assembly/linux-x86_64-cpu/pom.xml
index abefead175c7..fbc0ab027ac7 100644
--- a/scala-package/assembly/linux-x86_64-cpu/pom.xml
+++ b/scala-package/assembly/linux-x86_64-cpu/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.apache.mxnet</groupId>
     <artifactId>mxnet-full-parent_2.11</artifactId>
-    <version>1.5.0-SNAPSHOT</version>
+    <version>1.4.0-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 
@@ -18,18 +18,18 @@
     <dependency>
       <groupId>org.apache.mxnet</groupId>
       <artifactId>mxnet-core_${scala.binary.version}</artifactId>
-      <version>1.5.0-SNAPSHOT</version>
+      <version>1.4.0-SNAPSHOT</version>
     </dependency>
     <dependency>
       <groupId>org.apache.mxnet</groupId>
       <artifactId>libmxnet-scala-linux-x86_64-cpu</artifactId>
-      <version>1.5.0-SNAPSHOT</version>
+      <version>1.4.0-SNAPSHOT</version>
       <type>so</type>
     </dependency>
     <dependency>
       <groupId>org.apache.mxnet</groupId>
       <artifactId>mxnet-infer_${scala.binary.version}</artifactId>
-      <version>1.5.0-SNAPSHOT</version>
+      <version>1.4.0-SNAPSHOT</version>
     </dependency>
   </dependencies>
 
diff --git a/scala-package/assembly/linux-x86_64-gpu/pom.xml b/scala-package/assembly/linux-x86_64-gpu/pom.xml
index 96ffa38c6af2..a1a94808e918 100644
--- a/scala-package/assembly/linux-x86_64-gpu/pom.xml
+++ b/scala-package/assembly/linux-x86_64-gpu/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.apache.mxnet</groupId>
     <artifactId>mxnet-full-parent_2.11</artifactId>
-    <version>1.5.0-SNAPSHOT</version>
+    <version>1.4.0-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 
@@ -18,18 +18,18 @@
     <dependency>
       <groupId>org.apache.mxnet</groupId>
       <artifactId>mxnet-core_${scala.binary.version}</artifactId>
-      <version>1.5.0-SNAPSHOT</version>
+      <version>1.4.0-SNAPSHOT</version>
     </dependency>
     <dependency>
       <groupId>org.apache.mxnet</groupId>
       <artifactId>libmxnet-scala-linux-x86_64-gpu</artifactId>
-      <version>1.5.0-SNAPSHOT</version>
+      <version>1.4.0-SNAPSHOT</version>
       <type>so</type>
     </dependency>
     <dependency>
       <groupId>org.apache.mxnet</groupId>
       <artifactId>mxnet-infer_${scala.binary.version}</artifactId>
-      <version>1.5.0-SNAPSHOT</version>
+      <version>1.4.0-SNAPSHOT</version>
     </dependency>
   </dependencies>
 
diff --git a/scala-package/assembly/osx-x86_64-cpu/pom.xml b/scala-package/assembly/osx-x86_64-cpu/pom.xml
index 5c5733a9a4ce..bb6af0353762 100644
--- a/scala-package/assembly/osx-x86_64-cpu/pom.xml
+++ b/scala-package/assembly/osx-x86_64-cpu/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.apache.mxnet</groupId>
     <artifactId>mxnet-full-parent_2.11</artifactId>
-    <version>1.5.0-SNAPSHOT</version>
+    <version>1.4.0-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 
@@ -18,18 +18,18 @@
     <dependency>
       <groupId>org.apache.mxnet</groupId>
       <artifactId>mxnet-core_${scala.binary.version}</artifactId>
-      <version>1.5.0-SNAPSHOT</version>
+      <version>1.4.0-SNAPSHOT</version>
     </dependency>
     <dependency>
       <groupId>org.apache.mxnet</groupId>
       <artifactId>libmxnet-scala-osx-x86_64-cpu</artifactId>
-      <version>1.5.0-SNAPSHOT</version>
+      <version>1.4.0-SNAPSHOT</version>
       <type>jnilib</type>
     </dependency>
     <dependency>
       <groupId>org.apache.mxnet</groupId>
       <artifactId>mxnet-infer_${scala.binary.version}</artifactId>
-      <version>1.5.0-SNAPSHOT</version>
+      <version>1.4.0-SNAPSHOT</version>
     </dependency>
   </dependencies>
 
diff --git a/scala-package/assembly/pom.xml b/scala-package/assembly/pom.xml
index c1d1a3b8e721..8de320eb2ade 100644
--- a/scala-package/assembly/pom.xml
+++ b/scala-package/assembly/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.apache.mxnet</groupId>
     <artifactId>mxnet-parent_2.11</artifactId>
-    <version>1.5.0-SNAPSHOT</version>
+    <version>1.4.0-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 
diff --git a/scala-package/core/pom.xml b/scala-package/core/pom.xml
index 484fbbd96790..3425bb15f62a 100644
--- a/scala-package/core/pom.xml
+++ b/scala-package/core/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.apache.mxnet</groupId>
     <artifactId>mxnet-parent_2.11</artifactId>
-    <version>1.5.0-SNAPSHOT</version>
+    <version>1.4.0-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 
@@ -100,13 +100,13 @@
     <dependency>
       <groupId>org.apache.mxnet</groupId>
       <artifactId>mxnet-init_${scala.binary.version}</artifactId>
-      <version>1.5.0-SNAPSHOT</version>
+      <version>1.4.0-SNAPSHOT</version>
       <scope>provided</scope>
     </dependency>
     <dependency>
       <groupId>org.apache.mxnet</groupId>
       <artifactId>mxnet-macros_${scala.binary.version}</artifactId>
-      <version>1.5.0-SNAPSHOT</version>
+      <version>1.4.0-SNAPSHOT</version>
       <scope>provided</scope>
     </dependency>
     <dependency>
diff --git a/scala-package/examples/pom.xml b/scala-package/examples/pom.xml
index 8d3d156a0b18..9e8e119c3c4f 100644
--- a/scala-package/examples/pom.xml
+++ b/scala-package/examples/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.apache.mxnet</groupId>
     <artifactId>mxnet-parent_2.11</artifactId>
-    <version>1.5.0-SNAPSHOT</version>
+    <version>1.4.0-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 
@@ -149,13 +149,13 @@
     <dependency>
       <groupId>org.apache.mxnet</groupId>
       <artifactId>mxnet-core_${scala.binary.version}</artifactId>
-      <version>1.5.0-SNAPSHOT</version>
+      <version>1.4.0-SNAPSHOT</version>
       <scope>provided</scope>
     </dependency>
     <dependency>
       <groupId>org.apache.mxnet</groupId>
       <artifactId>mxnet-infer_${scala.binary.version}</artifactId>
-      <version>1.5.0-SNAPSHOT</version>
+      <version>1.4.0-SNAPSHOT</version>
       <scope>provided</scope>
     </dependency>
     <dependency>
diff --git a/scala-package/infer/pom.xml b/scala-package/infer/pom.xml
index ac76cdd19f3b..3e6980cb6f4b 100644
--- a/scala-package/infer/pom.xml
+++ b/scala-package/infer/pom.xml
@@ -6,7 +6,7 @@
     <parent>
         <artifactId>mxnet-parent_2.11</artifactId>
         <groupId>org.apache.mxnet</groupId>
-        <version>1.5.0-SNAPSHOT</version>
+        <version>1.4.0-SNAPSHOT</version>
         <relativePath>../pom.xml</relativePath>
     </parent>
 
@@ -89,7 +89,7 @@
         <dependency>
             <groupId>org.apache.mxnet</groupId>
             <artifactId>mxnet-core_${scala.binary.version}</artifactId>
-            <version>1.5.0-SNAPSHOT</version>
+            <version>1.4.0-SNAPSHOT</version>
             <scope>provided</scope>
         </dependency>
         <!-- https://mvnrepository.com/artifact/org.mockito/mockito-all -->
diff --git a/scala-package/init-native/linux-x86_64/pom.xml b/scala-package/init-native/linux-x86_64/pom.xml
index b71d7cf71528..12a36bd6e944 100644
--- a/scala-package/init-native/linux-x86_64/pom.xml
+++ b/scala-package/init-native/linux-x86_64/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.apache.mxnet</groupId>
     <artifactId>mxnet-scala-init-native-parent</artifactId>
-    <version>1.5.0-SNAPSHOT</version>
+    <version>1.4.0-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 
@@ -20,7 +20,7 @@
     <dependency>
       <groupId>org.apache.mxnet</groupId>
       <artifactId>mxnet-init_${scala.binary.version}</artifactId>
-      <version>1.5.0-SNAPSHOT</version>
+      <version>1.4.0-SNAPSHOT</version>
       <type>jar</type>
       <scope>compile</scope>
     </dependency>
diff --git a/scala-package/init-native/osx-x86_64/pom.xml b/scala-package/init-native/osx-x86_64/pom.xml
index b4a0b1d6584a..d0290942ef84 100644
--- a/scala-package/init-native/osx-x86_64/pom.xml
+++ b/scala-package/init-native/osx-x86_64/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.apache.mxnet</groupId>
     <artifactId>mxnet-scala-init-native-parent</artifactId>
-    <version>1.5.0-SNAPSHOT</version>
+    <version>1.4.0-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 
@@ -20,7 +20,7 @@
     <dependency>
       <groupId>org.apache.mxnet</groupId>
       <artifactId>mxnet-init_${scala.binary.version}</artifactId>
-      <version>1.5.0-SNAPSHOT</version>
+      <version>1.4.0-SNAPSHOT</version>
       <type>jar</type>
       <scope>compile</scope>
     </dependency>
diff --git a/scala-package/init-native/pom.xml b/scala-package/init-native/pom.xml
index bed216e45035..17a829c0c217 100644
--- a/scala-package/init-native/pom.xml
+++ b/scala-package/init-native/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.apache.mxnet</groupId>
     <artifactId>mxnet-parent_2.11</artifactId>
-    <version>1.5.0-SNAPSHOT</version>
+    <version>1.4.0-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 
diff --git a/scala-package/init/pom.xml b/scala-package/init/pom.xml
index 4278df6f2e73..a5b88c308637 100644
--- a/scala-package/init/pom.xml
+++ b/scala-package/init/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.apache.mxnet</groupId>
     <artifactId>mxnet-parent_2.11</artifactId>
-    <version>1.5.0-SNAPSHOT</version>
+    <version>1.4.0-SNAPSHOT</version>
 <!--  <relativePath>../pom.xml</relativePath>-->
   </parent>
 
diff --git a/scala-package/macros/pom.xml b/scala-package/macros/pom.xml
index cd56060b4b36..d435e211ceeb 100644
--- a/scala-package/macros/pom.xml
+++ b/scala-package/macros/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.apache.mxnet</groupId>
     <artifactId>mxnet-parent_2.11</artifactId>
-    <version>1.5.0-SNAPSHOT</version>
+    <version>1.4.0-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 
@@ -53,13 +53,13 @@
     <dependency>
       <groupId>org.apache.mxnet</groupId>
       <artifactId>mxnet-init_${scala.binary.version}</artifactId>
-      <version>1.5.0-SNAPSHOT</version>
+      <version>1.4.0-SNAPSHOT</version>
       <scope>provided</scope>
     </dependency>
     <dependency>
       <groupId>org.apache.mxnet</groupId>
       <artifactId>libmxnet-init-scala-${platform}</artifactId>
-      <version>1.5.0-SNAPSHOT</version>
+      <version>1.4.0-SNAPSHOT</version>
       <scope>provided</scope>
       <type>${libtype}</type>
     </dependency>
diff --git a/scala-package/native/linux-x86_64-cpu/pom.xml b/scala-package/native/linux-x86_64-cpu/pom.xml
index 2415cf7d26db..ac8e4a45e67a 100644
--- a/scala-package/native/linux-x86_64-cpu/pom.xml
+++ b/scala-package/native/linux-x86_64-cpu/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.apache.mxnet</groupId>
     <artifactId>mxnet-scala-native-parent</artifactId>
-    <version>1.5.0-SNAPSHOT</version>
+    <version>1.4.0-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 
@@ -20,7 +20,7 @@
     <dependency>
       <groupId>org.apache.mxnet</groupId>
       <artifactId>mxnet-core_${scala.binary.version}</artifactId>
-      <version>1.5.0-SNAPSHOT</version>
+      <version>1.4.0-SNAPSHOT</version>
       <type>jar</type>
       <scope>compile</scope>
     </dependency>
diff --git a/scala-package/native/linux-x86_64-gpu/pom.xml b/scala-package/native/linux-x86_64-gpu/pom.xml
index 0186217234bc..cdba5774f6a0 100644
--- a/scala-package/native/linux-x86_64-gpu/pom.xml
+++ b/scala-package/native/linux-x86_64-gpu/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.apache.mxnet</groupId>
     <artifactId>mxnet-scala-native-parent</artifactId>
-    <version>1.5.0-SNAPSHOT</version>
+    <version>1.4.0-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 
@@ -20,7 +20,7 @@
     <dependency>
       <groupId>org.apache.mxnet</groupId>
       <artifactId>mxnet-core_${scala.binary.version}</artifactId>
-      <version>1.5.0-SNAPSHOT</version>
+      <version>1.4.0-SNAPSHOT</version>
       <type>jar</type>
       <scope>compile</scope>
     </dependency>
diff --git a/scala-package/native/osx-x86_64-cpu/pom.xml b/scala-package/native/osx-x86_64-cpu/pom.xml
index 0ab7ca1dd0f0..333486c67392 100644
--- a/scala-package/native/osx-x86_64-cpu/pom.xml
+++ b/scala-package/native/osx-x86_64-cpu/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.apache.mxnet</groupId>
     <artifactId>mxnet-scala-native-parent</artifactId>
-    <version>1.5.0-SNAPSHOT</version>
+    <version>1.4.0-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 
@@ -20,7 +20,7 @@
     <dependency>
       <groupId>org.apache.mxnet</groupId>
       <artifactId>mxnet-core_${scala.binary.version}</artifactId>
-      <version>1.5.0-SNAPSHOT</version>
+      <version>1.4.0-SNAPSHOT</version>
       <type>jar</type>
       <scope>compile</scope>
     </dependency>
diff --git a/scala-package/native/pom.xml b/scala-package/native/pom.xml
index 2f6425d21104..e267c8d797ab 100644
--- a/scala-package/native/pom.xml
+++ b/scala-package/native/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.apache.mxnet</groupId>
     <artifactId>mxnet-parent_2.11</artifactId>
-    <version>1.5.0-SNAPSHOT</version>
+    <version>1.4.0-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 
diff --git a/scala-package/pom.xml b/scala-package/pom.xml
index 151462cbcc68..76bf00b54ba6 100644
--- a/scala-package/pom.xml
+++ b/scala-package/pom.xml
@@ -10,7 +10,7 @@
   </parent>
   <groupId>org.apache.mxnet</groupId>
   <artifactId>mxnet-parent_2.11</artifactId>
-  <version>1.5.0-SNAPSHOT</version>
+  <version>1.4.0-SNAPSHOT</version>
   <name>MXNet Scala Package - Parent</name>
   <url>https://github.com/apache/incubator-mxnet/tree/master/scala-package</url>
   <description>
diff --git a/scala-package/spark/pom.xml b/scala-package/spark/pom.xml
index 2db3bee8c78d..ee4f3efa98e4 100644
--- a/scala-package/spark/pom.xml
+++ b/scala-package/spark/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.apache.mxnet</groupId>
     <artifactId>mxnet-parent_2.11</artifactId>
-    <version>1.5.0-SNAPSHOT</version>
+    <version>1.4.0-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 
@@ -40,7 +40,7 @@
     <dependency>
       <groupId>org.apache.mxnet</groupId>
       <artifactId>mxnet-core_${scala.binary.version}</artifactId>
-      <version>1.5.0-SNAPSHOT</version>
+      <version>1.4.0-SNAPSHOT</version>
       <scope>provided</scope>
     </dependency>
     <dependency>
diff --git a/snapcraft.yaml b/snapcraft.yaml
index d8d0e301e6b1..e70bf6e5b4b3 100644
--- a/snapcraft.yaml
+++ b/snapcraft.yaml
@@ -1,5 +1,5 @@
 name: mxnet
-version: '1.5.0'
+version: '1.4.0'
 summary: MXNet is a deep learning framework designed for efficiency and flexibility.
 description: |
   MXNet is a deep learning framework designed for both efficiency and 
diff --git a/tests/cpp/unittest.mk b/tests/cpp/unittest.mk
index 665ce6982874..746ee2f096f1 100644
--- a/tests/cpp/unittest.mk
+++ b/tests/cpp/unittest.mk
@@ -41,22 +41,22 @@ gtest-all.o : $(GTEST_SRCS_)
 gtest.a : gtest-all.o
 	$(AR) $(ARFLAGS) $@ $^
 
-build/tests/cpp/%.o : tests/cpp/%.cc
+build/tests/cpp/%.o : tests/cpp/%.cc | mkldnn
 	@mkdir -p $(@D)
 	$(CXX) -std=c++11 $(TEST_CFLAGS) -I$(GTEST_INC) -MM -MT tests/cpp/$* $< > build/tests/cpp/$*.d
 	$(CXX) -c -std=c++11 $(TEST_CFLAGS) -I$(GTEST_INC) -o build/tests/cpp/$*.o $(filter %.cc %.a, $^)
 
-build/tests/cpp/operator/%.o : tests/cpp/operator/%.cc
+build/tests/cpp/operator/%.o : tests/cpp/operator/%.cc | mkldnn
 	@mkdir -p $(@D)
 	$(CXX) -std=c++11 $(TEST_CFLAGS) -I$(GTEST_INC) -MM -MT tests/cpp/operator/$* $< > build/tests/cpp/operator/$*.d
 	$(CXX) -c -std=c++11 $(TEST_CFLAGS) -I$(GTEST_INC) -o build/tests/cpp/operator/$*.o $(filter %.cc %.a, $^)
 
-build/tests/cpp/storage/%.o : tests/cpp/storage/%.cc
+build/tests/cpp/storage/%.o : tests/cpp/storage/%.cc | mkldnn
 	@mkdir -p $(@D)
 	$(CXX) -std=c++11 $(TEST_CFLAGS) -I$(GTEST_INC) -MM -MT tests/cpp/storage/$* $< > build/tests/cpp/storage/$*.d
 	$(CXX) -c -std=c++11 $(TEST_CFLAGS) -I$(GTEST_INC) -o build/tests/cpp/storage/$*.o $(filter %.cc %.a, $^)
 
-build/tests/cpp/engine/%.o : tests/cpp/engine/%.cc
+build/tests/cpp/engine/%.o : tests/cpp/engine/%.cc | mkldnn
 	@mkdir -p $(@D)
 	$(CXX) -std=c++11 $(TEST_CFLAGS) -I$(GTEST_INC) -MM -MT tests/cpp/engine/$* $< > build/tests/cpp/engine/$*.d
 	$(CXX) -c -std=c++11 $(TEST_CFLAGS) -I$(GTEST_INC) -o build/tests/cpp/engine/$*.o $(filter %.cc %.a, $^)
diff --git a/tests/nightly/model_backwards_compatibility_check/train_mxnet_legacy_models.sh b/tests/nightly/model_backwards_compatibility_check/train_mxnet_legacy_models.sh
index bda47f9e650d..02d480d9d3ba 100755
--- a/tests/nightly/model_backwards_compatibility_check/train_mxnet_legacy_models.sh
+++ b/tests/nightly/model_backwards_compatibility_check/train_mxnet_legacy_models.sh
@@ -61,8 +61,8 @@ echo `pwd`
 ## This list is sorted in descending order chronologically.
 ## Sample output for the below git tag command is : 1.2.0 utils 1.1.0 1.0.0 0.12.1
 ## so from this sample, we will pick up all the versions matching with the current latest version
-## Now while performing inference the latest version could be 1.5.0, which will help in validating models trained
-## on 1.1.0 and 1.2.0 by loading them on the latest version (1.5.0)
+## Now while performing inference the latest version could be 1.4.0, which will help in validating models trained
+## on 1.1.0 and 1.2.0 by loading them on the latest version (1.4.0)
 ## Over a period of time, the model repository will grow since with every new release we
 ## upload models trained on newer versions as well through this script
 previous_versions=($(git tag --sort=-creatordate | grep --invert-match rc))
diff --git a/tests/python/mkl/test_mkldnn.py b/tests/python/mkl/test_mkldnn.py
index d9d3abfc3ced..c6c0a0832f1f 100644
--- a/tests/python/mkl/test_mkldnn.py
+++ b/tests/python/mkl/test_mkldnn.py
@@ -27,6 +27,7 @@
 from mxnet import gluon
 from mxnet.gluon import nn
 from mxnet.test_utils import *
+import test_mkldnn_install as install
 curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
 sys.path.append(os.path.join(curr_path, '../unittest/'))
 from common import with_seed
@@ -440,4 +441,7 @@ def backward(self, req, out_grad, in_data, out_data, in_grad, aux):
     custom = mx.symbol.Custom(name='custom', data=conv, op_type='custom')
     exec1 = custom.bind(mx.cpu(), args={'data': mx.nd.ones([10,3,96,96]), 'conv_weight': mx.nd.ones([8,3,5,5])})
     exec1.forward()[0].wait_to_read()
-    
+
+
+if __name__ == '__main__':
+    install.test_mkldnn_install()
diff --git a/tests/python/mkl/test_mkldnn_install.py b/tests/python/mkl/test_mkldnn_install.py
new file mode 100644
index 000000000000..c2f26df72f2e
--- /dev/null
+++ b/tests/python/mkl/test_mkldnn_install.py
@@ -0,0 +1,56 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""
+MKL-DNN related test cases
+"""
+
+import sys
+import os
+import logging
+
+
+def test_mkldnn_install():
+    """
+    This test will verify that MXNet is built/installed correctly when
+    compiled with Intel MKL-DNN library. The method will try to import
+    the mxnet module and see if the mkldnn library is mapped to this
+    process's address space.
+    """
+    logging.basicConfig(level=logging.INFO)
+
+    if not sys.platform.startswith('linux'):
+        logging.info("Bypass mkldnn install test for non-Linux OS")
+        return
+
+    try:
+        #pylint: disable=unused-variable
+        import mxnet as mx
+    except (ImportError, OSError) as e:
+        assert 0, "Import mxnet error: %s. Please double check your build/" \
+            "install steps or environment variable settings" % str(e)
+
+    pid = os.getpid()
+    rc = os.system("cat /proc/" + str(pid) +
+                   "/maps | grep libmkldnn > /dev/null")
+
+    if rc == 0:
+        logging.info("MXNet is built/installed correctly with MKL-DNN")
+    else:
+        assert 0, "MXNet is built/installed incorrectly with MKL-DNN, please " \
+            "double check your build/install steps or environment " \
+            "variable settings"

From 8feb826e0b707531f8596e209e2d37598fd7a4d7 Mon Sep 17 00:00:00 2001
From: Nicolas Modrzyk <hellonico@gmail.com>
Date: Fri, 7 Dec 2018 02:06:16 +0900
Subject: [PATCH 41/54]  #13441 [Clojure] Add Spec Validations for the Random
 namespace (#13523)

---
 .../neural-style/src/neural_style/core.clj    |  2 +-
 .../org/apache/clojure_mxnet/optimizer.clj    | 26 ++++++++--------
 .../src/org/apache/clojure_mxnet/random.clj   | 30 +++++++++++++++++--
 .../apache/clojure_mxnet/operator_test.clj    |  2 +-
 .../org/apache/clojure_mxnet/random_test.clj  | 17 +++++++++--
 5 files changed, 57 insertions(+), 20 deletions(-)

diff --git a/contrib/clojure-package/examples/neural-style/src/neural_style/core.clj b/contrib/clojure-package/examples/neural-style/src/neural_style/core.clj
index 50f95c9750ee..fcf402f3466d 100644
--- a/contrib/clojure-package/examples/neural-style/src/neural_style/core.clj
+++ b/contrib/clojure-package/examples/neural-style/src/neural_style/core.clj
@@ -193,7 +193,7 @@
         ;;;train
 
         ;;initialize with random noise
-        img (ndarray/- (random/uniform 0 255 content-np-shape dev) 128)
+        img (ndarray/- (random/uniform 0 255 content-np-shape {:ctx dev}) 128)
         ;;; img (random/uniform -0.1 0.1 content-np-shape dev)
         ;; img content-np
         lr-sched (lr-scheduler/factor-scheduler 10 0.9)
diff --git a/contrib/clojure-package/src/org/apache/clojure_mxnet/optimizer.clj b/contrib/clojure-package/src/org/apache/clojure_mxnet/optimizer.clj
index f77f5532bfb1..672090a899b3 100644
--- a/contrib/clojure-package/src/org/apache/clojure_mxnet/optimizer.clj
+++ b/contrib/clojure-package/src/org/apache/clojure_mxnet/optimizer.clj
@@ -24,11 +24,11 @@
    (org.apache.mxnet.optimizer SGD DCASGD NAG AdaDelta RMSProp AdaGrad Adam SGLD)
    (org.apache.mxnet FactorScheduler)))
 
-(s/def ::learning-rate float?)
-(s/def ::momentum float?)
-(s/def ::wd float?)
-(s/def ::clip-gradient float?)
-(s/def ::lr-scheduler #(instance? FactorScheduler))
+(s/def ::learning-rate number?)
+(s/def ::momentum number?)
+(s/def ::wd number?)
+(s/def ::clip-gradient number?)
+(s/def ::lr-scheduler #(instance? FactorScheduler %))
 (s/def ::sgd-opts (s/keys :opt-un [::learning-rate ::momentum ::wd ::clip-gradient ::lr-scheduler]))
 
 (defn sgd
@@ -43,7 +43,7 @@
   ([]
    (sgd {})))
 
-(s/def ::lambda float?)
+(s/def ::lambda number?)
 (s/def ::dcasgd-opts (s/keys :opt-un [::learning-rate ::momentum ::lambda ::wd ::clip-gradient ::lr-scheduler]))
 
 (defn dcasgd
@@ -77,9 +77,9 @@
   ([]
    (nag {})))
 
-(s/def ::rho float?)
-(s/def ::rescale-gradient float?)
-(s/def ::epsilon float?)
+(s/def ::rho number?)
+(s/def ::rescale-gradient number?)
+(s/def ::epsilon number?)
 (s/def ::ada-delta-opts (s/keys :opt-un [::rho ::rescale-gradient ::epsilon ::wd ::clip-gradient]))
 
 (defn ada-delta
@@ -96,8 +96,8 @@
   ([]
    (ada-delta {})))
 
-(s/def gamma1 float?)
-(s/def gamma2 float?)
+(s/def gamma1 number?)
+(s/def gamma2 number?)
 (s/def ::rms-prop-opts (s/keys :opt-un [::learning-rate ::rescale-gradient ::gamma1 ::gamma2 ::wd ::clip-gradient]))
 
 (defn rms-prop
@@ -144,8 +144,8 @@
   ([]
    (ada-grad {})))
 
-(s/def ::beta1 float?)
-(s/def ::beta2 float?)
+(s/def ::beta1 number?)
+(s/def ::beta2 number?)
 (s/def ::adam-opts (s/keys :opt-un [::learning-rate ::beta1 ::beta2 ::epsilon ::decay-factor ::wd ::clip-gradient ::lr-scheduler]))
 
 (defn adam
diff --git a/contrib/clojure-package/src/org/apache/clojure_mxnet/random.clj b/contrib/clojure-package/src/org/apache/clojure_mxnet/random.clj
index d6e33789a629..0ec2039ba79b 100644
--- a/contrib/clojure-package/src/org/apache/clojure_mxnet/random.clj
+++ b/contrib/clojure-package/src/org/apache/clojure_mxnet/random.clj
@@ -16,8 +16,18 @@
 ;;
 
 (ns org.apache.clojure-mxnet.random
-  (:require [org.apache.clojure-mxnet.shape :as mx-shape])
-  (:import (org.apache.mxnet Random)))
+  (:require
+   [org.apache.clojure-mxnet.shape :as mx-shape]
+   [org.apache.clojure-mxnet.context :as context]
+   [clojure.spec.alpha :as s]
+   [org.apache.clojure-mxnet.util :as util])
+  (:import (org.apache.mxnet Context Random)))
+
+(s/def ::low number?)
+(s/def ::high number?)
+(s/def ::shape-vec (s/coll-of pos-int? :kind vector?))
+(s/def ::ctx #(instance? Context %))
+(s/def ::uniform-opts (s/keys :opt-un [::ctx]))
 
 (defn uniform
   "Generate uniform distribution in [low, high) with shape.
@@ -29,10 +39,18 @@
       out: Output place holder}
     returns: The result ndarray with generated result./"
   ([low high shape-vec {:keys [ctx out] :as opts}]
+   (util/validate! ::uniform-opts opts "Incorrect random uniform parameters")
+   (util/validate! ::low low  "Incorrect random uniform parameter")
+   (util/validate! ::high high  "Incorrect random uniform parameters")
+   (util/validate! ::shape-vec shape-vec  "Incorrect random uniform parameters")
    (Random/uniform (float low) (float high) (mx-shape/->shape shape-vec) ctx out))
   ([low high shape-vec]
    (uniform low high shape-vec {})))
 
+(s/def ::loc number?)
+(s/def ::scale number?)
+(s/def ::normal-opts (s/keys :opt-un [::ctx]))
+
 (defn normal
   "Generate normal(Gaussian) distribution N(mean, stdvar^^2) with shape.
     loc: The standard deviation of the normal distribution
@@ -43,10 +61,15 @@
       out: Output place holder}
     returns: The result ndarray with generated result./"
   ([loc scale shape-vec {:keys [ctx out] :as opts}]
+   (util/validate! ::normal-opts opts  "Incorrect random normal parameters")
+   (util/validate! ::loc loc  "Incorrect random normal parameters")
+   (util/validate! ::scale scale  "Incorrect random normal parameters")
+   (util/validate! ::shape-vec shape-vec  "Incorrect random uniform parameters")
    (Random/normal (float loc) (float scale) (mx-shape/->shape shape-vec) ctx out))
   ([loc scale shape-vec]
    (normal loc scale shape-vec {})))
 
+(s/def ::seed-state number?)
 (defn seed
   " Seed the random number generators in mxnet.
     This seed will affect behavior of functions in this module,
@@ -58,4 +81,5 @@
          This means if you set the same seed, the random number sequence
          generated from GPU0 can be different from CPU."
   [seed-state]
-  (Random/seed (int seed-state)))
+  (util/validate! ::seed-state seed-state  "Incorrect seed parameters")
+  (Random/seed (int seed-state)))
\ No newline at end of file
diff --git a/contrib/clojure-package/test/org/apache/clojure_mxnet/operator_test.clj b/contrib/clojure-package/test/org/apache/clojure_mxnet/operator_test.clj
index 1b4b2ea2fbe3..c97711b5fed6 100644
--- a/contrib/clojure-package/test/org/apache/clojure_mxnet/operator_test.clj
+++ b/contrib/clojure-package/test/org/apache/clojure_mxnet/operator_test.clj
@@ -462,7 +462,7 @@
         test (sym/transpose data)
         shape-vec [3 4]
         ctx (context/default-context)
-        arr-data (random/uniform 0 100 shape-vec ctx)
+        arr-data (random/uniform 0 100 shape-vec {:ctx ctx})
         trans (ndarray/transpose (ndarray/copy arr-data))
         exec-test (sym/bind test ctx {"data" arr-data})
         out     (->  (executor/forward exec-test)
diff --git a/contrib/clojure-package/test/org/apache/clojure_mxnet/random_test.clj b/contrib/clojure-package/test/org/apache/clojure_mxnet/random_test.clj
index c4e9198073a8..6952335c1390 100644
--- a/contrib/clojure-package/test/org/apache/clojure_mxnet/random_test.clj
+++ b/contrib/clojure-package/test/org/apache/clojure_mxnet/random_test.clj
@@ -26,9 +26,9 @@
     (let [[a b] [-10 10]
           shape [100 100]
           _ (random/seed 128)
-          un1 (random/uniform a b shape {:context ctx})
+          un1 (random/uniform a b shape {:ctx ctx})
           _ (random/seed 128)
-          un2 (random/uniform a b shape {:context ctx})]
+          un2 (random/uniform a b shape {:ctx ctx})]
       (is (= un1 un2))
       (is (<  (Math/abs
                (/ (/ (apply + (ndarray/->vec un1))
@@ -52,3 +52,16 @@
       (is (<  (Math/abs (- mean mu)) 0.1))
       (is (< (Math/abs (- stddev sigma)) 0.1)))))
 
+(defn random-or-normal [fn_]
+  (is (thrown? Exception (fn_ 'a 2 [])))
+  (is (thrown? Exception (fn_ 1 'b [])))
+  (is (thrown? Exception (fn_ 1 2 [-1])))
+  (is (thrown? Exception (fn_ 1 2 [2 3 0])))
+  (is (thrown? Exception (fn_ 1 2 [10 10] {:ctx "a"})))
+  (let [ctx (context/default-context)]
+    (is (not (nil? (fn_ 1 1 [100 100] {:ctx ctx}))))))
+
+(deftest test-random-parameters-specs
+  (random-or-normal random/normal)
+  (random-or-normal random/uniform)
+  (is (thrown? Exception (random/seed "a"))))
\ No newline at end of file

From f390f0cd501c57b5e399eb6c1475d9efbf7c28d5 Mon Sep 17 00:00:00 2001
From: Roshani Nagmote <roshaninagmote2@gmail.com>
Date: Thu, 6 Dec 2018 16:37:28 -0800
Subject: [PATCH 42/54] Adding test for softmaxoutput (#13116)

---
 .../contrib/onnx/mx2onnx/_op_translations.py  |  2 +-
 .../onnx/export/mxnet_export_test.py          | 22 +++++++++++++++++++
 2 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/python/mxnet/contrib/onnx/mx2onnx/_op_translations.py b/python/mxnet/contrib/onnx/mx2onnx/_op_translations.py
index 86767a667128..e605e824be43 100644
--- a/python/mxnet/contrib/onnx/mx2onnx/_op_translations.py
+++ b/python/mxnet/contrib/onnx/mx2onnx/_op_translations.py
@@ -705,7 +705,7 @@ def convert_softmax_output(node, **kwargs):
 
     softmax_node = onnx.helper.make_node(
         "Softmax",
-        [input1.output[0]],
+        [input1.name],
         [name],
         axis=1,
         name=name
diff --git a/tests/python-pytest/onnx/export/mxnet_export_test.py b/tests/python-pytest/onnx/export/mxnet_export_test.py
index 6b858f05e24f..22db0d637a3a 100644
--- a/tests/python-pytest/onnx/export/mxnet_export_test.py
+++ b/tests/python-pytest/onnx/export/mxnet_export_test.py
@@ -241,6 +241,28 @@ def test_square():
 
     npt.assert_almost_equal(result, numpy_op)
 
+
+def test_softmax():
+    input1 = np.random.rand(1000, 1000).astype("float32")
+    label1 = np.random.rand(1000)
+    input_nd = mx.nd.array(input1)
+    label_nd = mx.nd.array(label1)
+
+    ipsym = mx.sym.Variable("ipsym")
+    label = mx.sym.Variable('label')
+    sym = mx.sym.SoftmaxOutput(data=ipsym, label=label, ignore_label=0, use_ignore=False)
+    ex = sym.bind(ctx=mx.cpu(0), args={'ipsym': input_nd, 'label': label_nd})
+    ex.forward(is_train=True)
+    softmax_out = ex.outputs[0].asnumpy()
+
+    converted_model = onnx_mxnet.export_model(sym, {}, [(1000, 1000), (1000,)], np.float32, "softmaxop.onnx")
+
+    sym, arg_params, aux_params = onnx_mxnet.import_model(converted_model)
+    result = forward_pass(sym, arg_params, aux_params, ['ipsym'], input1)
+
+    # Comparing result of forward pass before using onnx export, import
+    npt.assert_almost_equal(result, softmax_out)
+
 @with_seed()
 def test_comparison_ops():
     """Test greater, lesser, equal"""

From 9c0d1731008ee7a63d9e9c886c40e254991e4a0a Mon Sep 17 00:00:00 2001
From: Marco de Abreu <marcoabreu@users.noreply.github.com>
Date: Fri, 7 Dec 2018 01:51:01 +0000
Subject: [PATCH 43/54] Add workspace cleaning after job finished (#13490)

* Add workspace cleaning after job finished

* Update Jenkinsfile_utils.groovy

* Update Jenkinsfile_utils.groovy
---
 ci/Jenkinsfile_utils.groovy | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/ci/Jenkinsfile_utils.groovy b/ci/Jenkinsfile_utils.groovy
index fb84e0045d53..f82c238ed075 100644
--- a/ci/Jenkinsfile_utils.groovy
+++ b/ci/Jenkinsfile_utils.groovy
@@ -263,7 +263,10 @@ def main_wrapper(args) {
     node(NODE_UTILITY) {
       // Call failure handler
       args['failure_handler']()
-      
+
+      // Clean workspace to reduce space requirements
+      cleanWs()
+
       // Remember to rethrow so the build is marked as failing
       if (err) {
         throw err

From 0011ab2f4bf87619cdfe0bb928d74c5ae0df452c Mon Sep 17 00:00:00 2001
From: Chaitanya Prakash Bapat <chai.bapat@gmail.com>
Date: Thu, 6 Dec 2018 23:49:16 -0800
Subject: [PATCH 44/54] Fix flaky test test_random:test_randint_generator
 (#13498)

* updated seed, alpha value, comments

* typo in comment fix

* added nrepeat

* removed unused variable, added link for scipy alpha, rephrased the sentence for discrete distribution buckets

* removed fixed seed, alpha
---
 python/mxnet/test_utils.py           | 11 ++++++-----
 tests/python/unittest/test_random.py | 11 ++++++-----
 2 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/python/mxnet/test_utils.py b/python/mxnet/test_utils.py
index 26f7762ca9b5..0a4d17dc2668 100644
--- a/python/mxnet/test_utils.py
+++ b/python/mxnet/test_utils.py
@@ -1849,12 +1849,12 @@ def chi_square_check(generator, buckets, probs, nsamples=1000000):
 
     If the generator is continuous, the buckets should contain tuples of (range_min, range_max) \
     and the probs should be the corresponding ideal probability within the specific ranges. \
-    Otherwise, the buckets should be the possible output of the discrete distribution and the \
+    Otherwise, the buckets should contain all the possible values generated over the discrete distribution and the \
     probs should be groud-truth probability.
 
     Usually the user is required to specify the probs parameter.
 
-    After obtatining the p value, we could further use the standard p > 0.05 threshold to get \
+    After obtaining the p value, we could further use the standard p > 0.05 (alpha) threshold to get \
     the final result.
 
     Examples::
@@ -1906,7 +1906,6 @@ def chi_square_check(generator, buckets, probs, nsamples=1000000):
             buckets_npy[i * 2 + 1] = buckets[i][1]
     else:
         continuous_dist = False
-        buckets_npy = np.array(buckets)
     expected_freq = (nsamples * np.array(probs, dtype=np.float32)).astype(np.int32)
     if continuous_dist:
         sample_bucket_ids = np.searchsorted(buckets_npy, samples, side='right')
@@ -1923,7 +1922,7 @@ def chi_square_check(generator, buckets, probs, nsamples=1000000):
     _, p = ss.chisquare(f_obs=obs_freq, f_exp=expected_freq)
     return p, obs_freq, expected_freq
 
-def verify_generator(generator, buckets, probs, nsamples=1000000, nrepeat=5, success_rate=0.15):
+def verify_generator(generator, buckets, probs, nsamples=1000000, nrepeat=5, success_rate=0.25, alpha=0.05):
     """Verify whether the generator is correct using chi-square testing.
 
     The test is repeated for "nrepeat" times and we check if the success rate is
@@ -1946,6 +1945,8 @@ def verify_generator(generator, buckets, probs, nsamples=1000000, nrepeat=5, suc
         The times to repeat the test
     success_rate: float
         The desired success rate
+    alpha: float
+        The desired threshold for type-I error i.e. when a true null hypothesis is rejected
 
     Returns
     -------
@@ -1961,7 +1962,7 @@ def verify_generator(generator, buckets, probs, nsamples=1000000, nrepeat=5, suc
         cs_ret_l.append(cs_ret)
         obs_freq_l.append(obs_freq)
         expected_freq_l.append(expected_freq)
-    success_num = (np.array(cs_ret_l) > 0.05).sum()
+    success_num = (np.array(cs_ret_l) > alpha).sum()
     if success_num < nrepeat * success_rate:
         raise AssertionError("Generator test fails, Chi-square p=%s, obs_freq=%s, expected_freq=%s."
                              "\nbuckets=%s, probs=%s"
diff --git a/tests/python/unittest/test_random.py b/tests/python/unittest/test_random.py
index 3026d31c0f96..405602f073bb 100644
--- a/tests/python/unittest/test_random.py
+++ b/tests/python/unittest/test_random.py
@@ -860,25 +860,26 @@ def test_randint_extremes():
     assert a>=50000000 and a<=50000010
 
 @with_seed()
-@unittest.skip("Flaky test: https://github.com/apache/incubator-mxnet/issues/13446")
 def test_randint_generator():
     ctx = mx.context.current_context()
     for dtype in ['int32', 'int64']:
-        for low, high in [(50000000, 50001000),(-50000000,-9900),(-500,199),(-2147483647,2147483647)]:
+        for low, high in [(50000000, 50001000),(-50000100,-50000000),(-500,199)]:
             scale = high - low
             buckets, probs = gen_buckets_probs_with_ppf(lambda x: ss.uniform.ppf(x, loc=low, scale=scale), 5)
             # Quantize bucket boundaries to reflect the actual dtype and adjust probs accordingly
             buckets = np.array(buckets, dtype=dtype).tolist()
             probs = [(buckets[i][1] - buckets[i][0]) / float(scale) for i in range(5)]
             generator_mx = lambda x: mx.nd.random.randint(low, high, shape=x, ctx=ctx, dtype=dtype).asnumpy()
-            verify_generator(generator=generator_mx, buckets=buckets, probs=probs)
+            verify_generator(generator=generator_mx, buckets=buckets, probs=probs, nrepeat=100)
+            # Scipy uses alpha = 0.01 for testing discrete distribution generator but we are using default alpha=0.05 (higher threshold ensures robustness)
+            # Refer - https://github.com/scipy/scipy/blob/9f12af697763fb5f9767d5cb1280ce62456a3974/scipy/stats/tests/test_discrete_basic.py#L45
             generator_mx_same_seed = \
                 lambda x: np.concatenate(
                     [mx.nd.random.randint(low, high, shape=x // 10, ctx=ctx, dtype=dtype).asnumpy()
                         for _ in range(10)])
-            verify_generator(generator=generator_mx_same_seed, buckets=buckets, probs=probs)
+            verify_generator(generator=generator_mx_same_seed, buckets=buckets, probs=probs, nrepeat=100)
 
-with_seed()
+@with_seed()
 def test_randint_without_dtype():
     a = mx.nd.random.randint(low=50000000, high=50000010, ctx=mx.context.current_context())
     assert(a.dtype, 'int32')

From 7485a78372b4a20211105b19e27a0549061ee688 Mon Sep 17 00:00:00 2001
From: Steffen Rochel <steffenrochel@gmail.com>
Date: Fri, 7 Dec 2018 05:59:35 -0800
Subject: [PATCH 45/54] Update version to v1.5.0 including clojure package
 (#13566)

* Update DESCRIPTION

* update version to v1.5.0 except for clojure

* update version from 1.4.0 to 1.5.0
- add utility script to help bump versions in future
- fix README to correct to current maven versions
---
 R-package/DESCRIPTION                         | 10 +++----
 contrib/clojure-package/README.md             | 16 +++++------
 .../cnn-text-classification/project.clj       |  2 +-
 .../clojure-package/examples/gan/project.clj  |  2 +-
 .../examples/imclassification/project.clj     |  2 +-
 .../examples/module/project.clj               |  2 +-
 .../examples/multi-label/project.clj          |  2 +-
 .../examples/neural-style/project.clj         |  2 +-
 .../examples/pre-trained-models/project.clj   |  2 +-
 .../examples/profiler/project.clj             |  2 +-
 .../clojure-package/examples/rnn/project.clj  |  2 +-
 .../examples/visualization/project.clj        |  2 +-
 contrib/clojure-package/project.clj           |  4 +--
 .../scripts/update_versions.sh                | 27 +++++++++++++++++++
 .../scala/mxnet_scala_on_intellij.md          |  4 +--
 include/mxnet/base.h                          |  2 +-
 python/mxnet/libinfo.py                       |  2 +-
 .../assembly/linux-x86_64-cpu/pom.xml         |  8 +++---
 .../assembly/linux-x86_64-gpu/pom.xml         |  8 +++---
 scala-package/assembly/osx-x86_64-cpu/pom.xml |  8 +++---
 scala-package/assembly/pom.xml                |  2 +-
 scala-package/core/pom.xml                    |  6 ++---
 scala-package/examples/pom.xml                |  6 ++---
 scala-package/infer/pom.xml                   |  4 +--
 .../init-native/linux-x86_64/pom.xml          |  4 +--
 scala-package/init-native/osx-x86_64/pom.xml  |  4 +--
 scala-package/init-native/pom.xml             |  2 +-
 scala-package/init/pom.xml                    |  2 +-
 scala-package/macros/pom.xml                  |  6 ++---
 scala-package/native/linux-x86_64-cpu/pom.xml |  4 +--
 scala-package/native/linux-x86_64-gpu/pom.xml |  4 +--
 scala-package/native/osx-x86_64-cpu/pom.xml   |  4 +--
 scala-package/native/pom.xml                  |  2 +-
 scala-package/pom.xml                         |  2 +-
 scala-package/spark/pom.xml                   |  4 +--
 snapcraft.yaml                                |  2 +-
 36 files changed, 97 insertions(+), 70 deletions(-)
 create mode 100755 contrib/clojure-package/scripts/update_versions.sh

diff --git a/R-package/DESCRIPTION b/R-package/DESCRIPTION
index 46702eff9ed7..c710a915bd88 100644
--- a/R-package/DESCRIPTION
+++ b/R-package/DESCRIPTION
@@ -1,17 +1,17 @@
 Package: mxnet
 Type: Package
 Title: MXNet: A Flexible and Efficient Machine Learning Library for Heterogeneous Distributed Systems
-Version: 1.4.0
+Version: 1.5.0
 Date: 2017-06-27
-Author: Tianqi Chen, Qiang Kou, Tong He
+Author: Tianqi Chen, Qiang Kou, Tong He, Anirudh Acharya <https://github.com/anirudhacharya>
 Maintainer: Qiang Kou <qkou@qkou.info>
-Repository: DMLC
+Repository: apache/incubator-mxnet
 Description: MXNet is a deep learning framework designed for both efficiency
     and flexibility. It allows you to mix the flavours of deep learning programs
     together to maximize the efficiency and your productivity.
 License: Apache License (== 2.0)
-URL: https://github.com/dmlc/mxnet/tree/master/R-package
-BugReports: https://github.com/dmlc/mxnet/issues
+URL: https://github.com/apache/incubator-mxnet/tree/master/R-package
+BugReports: https://github.com/apache/incubator-mxnet/issues
 Imports:
     methods,
     Rcpp (>= 0.12.1),
diff --git a/contrib/clojure-package/README.md b/contrib/clojure-package/README.md
index bc6100b86123..152c9c635e6d 100644
--- a/contrib/clojure-package/README.md
+++ b/contrib/clojure-package/README.md
@@ -105,9 +105,9 @@ brew install opencv
 - Create a new project with `lein new my-mxnet`
 - Edit your `project.clj` and add one of the following entries to `:dependencies`, based on your system and the compute device you want to use:
 
-  - `[org.apache.mxnet.contrib.clojure/clojure-mxnet-linux-cpu "1.4.0"]`
-  - `[org.apache.mxnet.contrib.clojure/clojure-mxnet-linux-gpu "1.4.0"]`
-  - `[org.apache.mxnet.contrib.clojure/clojure-mxnet-osx-cpu "1.4.0"]`
+  - `[org.apache.mxnet.contrib.clojure/clojure-mxnet-linux-cpu "1.3.1"]`
+  - `[org.apache.mxnet.contrib.clojure/clojure-mxnet-linux-gpu "1.3.1"]`
+  - `[org.apache.mxnet.contrib.clojure/clojure-mxnet-osx-cpu "1.3.1"]`
 
 After making this change and running `lein deps`, you should be able to run example code like this [NDArray Tutorial](https://github.com/apache/incubator-mxnet/blob/master/contrib/clojure-package/examples/tutorial/src/tutorial/ndarray.clj).
 
@@ -116,20 +116,20 @@ After making this change and running `lein deps`, you should be able to run exam
 With this option, you will install a Git revision of the Clojure package source and a [Scala package jar from Maven](https://search.maven.org/search?q=g:org.apache.mxnet) with native dependencies baked in.
 
 - Install additional dependencies as described in [the corresponding section for Option 1](#installing-additional-dependencies),
-- Recursively clone the MXNet repository and checkout the desired revision. Here we assume the `1.4.0` tag and a clone into the `~/mxnet` directory:
+- Recursively clone the MXNet repository and checkout the desired revision. Here we assume the `1.3.1` tag and a clone into the `~/mxnet` directory:
 
   ```bash
   git clone --recursive https://github.com/apache/incubator-mxnet.git ~/mxnet
   cd ~/mxnet
   git tag --list  # Find the tag that matches the Scala package version
-  git checkout tags/1.4.0 -b my_mxnet
+  git checkout tags/1.3.1 -b my_mxnet
   git submodule update --init --recursive
   cd contrib/clojure
   ```
 
 - Edit `project.clj` to include the desired Scala jar from Maven:
 
-      [org.apache.mxnet/mxnet-full_2.11-linux-x86_64-cpu "1.4.0”]
+      [org.apache.mxnet/mxnet-full_2.11-linux-x86_64-cpu "1.3.1"]
 
 - Run `lein test`. All the tests should run without error.
 - At this point you can run `lein install` to build and install the Clojure jar locally.
@@ -147,7 +147,7 @@ The first step is to recursively clone the MXNet repository and checkout the des
   ```bash
   git clone --recursive https://github.com/apache/incubator-mxnet.git ~/mxnet
   cd ~/mxnet
-  git checkout tags/1.4.0 -b my_mxnet  # this is optional
+  git checkout tags/1.3.1 -b my_mxnet  # this is optional
   git submodule update --init --recursive
   ```
 
@@ -176,7 +176,7 @@ The outcome of this step will be a shared library `lib/libmxnet.so` that is used
 
 #### Building the Clojure jar
  
-- Enter the `contrib/clojure` directory and edit the `project.clj` file. Add the Scala jar that was just created and installed, e.g., `[org.apache.mxnet/mxnet-full_2.11-osx-x86_64-cpu "1.4.0-SNAPSHOT"]`, to the `:dependencies`.
+- Enter the `contrib/clojure` directory and edit the `project.clj` file. Add the Scala jar that was just created and installed, e.g., `[org.apache.mxnet/mxnet-full_2.11-osx-x86_64-cpu "1.5.0-SNAPSHOT"]`, to the `:dependencies`.
 - Run `lein test`. All the tests should run without an error.
 - Run `lein install` to build and install the Clojure jar locally.
 
diff --git a/contrib/clojure-package/examples/cnn-text-classification/project.clj b/contrib/clojure-package/examples/cnn-text-classification/project.clj
index 3eed0ddf9d9c..29ebefe5d200 100644
--- a/contrib/clojure-package/examples/cnn-text-classification/project.clj
+++ b/contrib/clojure-package/examples/cnn-text-classification/project.clj
@@ -19,6 +19,6 @@
   :description "CNN text classification with MXNet"
   :plugins [[lein-cljfmt "0.5.7"]]
   :dependencies [[org.clojure/clojure "1.9.0"]
-                 [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.4.0-SNAPSHOT"]]
+                 [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.5.0-SNAPSHOT"]]
   :pedantic? :skip
   :main cnn-text-classification.classifier)
diff --git a/contrib/clojure-package/examples/gan/project.clj b/contrib/clojure-package/examples/gan/project.clj
index 36b7c6cb3089..b8f6903cabba 100644
--- a/contrib/clojure-package/examples/gan/project.clj
+++ b/contrib/clojure-package/examples/gan/project.clj
@@ -19,6 +19,6 @@
   :description "GAN MNIST with MXNet"
   :plugins [[lein-cljfmt "0.5.7"]]
   :dependencies [[org.clojure/clojure "1.9.0"]
-                 [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.4.0-SNAPSHOT"]
+                 [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.5.0-SNAPSHOT"]
                  [nu.pattern/opencv "2.4.9-7"]]
   :main gan.gan-mnist)
diff --git a/contrib/clojure-package/examples/imclassification/project.clj b/contrib/clojure-package/examples/imclassification/project.clj
index 0dbede5052ac..5f77cf55cf35 100644
--- a/contrib/clojure-package/examples/imclassification/project.clj
+++ b/contrib/clojure-package/examples/imclassification/project.clj
@@ -19,6 +19,6 @@
   :description "Clojure examples for image classification"
   :plugins [[lein-cljfmt "0.5.7"]]
   :dependencies [[org.clojure/clojure "1.9.0"]
-                 [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.4.0-SNAPSHOT"]]
+                 [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.5.0-SNAPSHOT"]]
   :pedantic? :skip
   :main imclassification.train-mnist)
diff --git a/contrib/clojure-package/examples/module/project.clj b/contrib/clojure-package/examples/module/project.clj
index a9a0a5f23e6e..b667a2a4e122 100644
--- a/contrib/clojure-package/examples/module/project.clj
+++ b/contrib/clojure-package/examples/module/project.clj
@@ -19,7 +19,7 @@
   :description "Clojure examples for module"
   :plugins [[lein-cljfmt "0.5.7"]]
   :dependencies [[org.clojure/clojure "1.9.0"]
-                 [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.4.0-SNAPSHOT"]]
+                 [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.5.0-SNAPSHOT"]]
   :pedantic? :skip
   :main mnist-mlp)
 
diff --git a/contrib/clojure-package/examples/multi-label/project.clj b/contrib/clojure-package/examples/multi-label/project.clj
index 8923738b946d..6e6a14340d36 100644
--- a/contrib/clojure-package/examples/multi-label/project.clj
+++ b/contrib/clojure-package/examples/multi-label/project.clj
@@ -19,5 +19,5 @@
   :description "Example of multi-label classification"
   :plugins [[lein-cljfmt "0.5.7"]]
   :dependencies [[org.clojure/clojure "1.9.0"]
-                 [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.4.0-SNAPSHOT"]]
+                 [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.5.0-SNAPSHOT"]]
   :main multi-label.core)
diff --git a/contrib/clojure-package/examples/neural-style/project.clj b/contrib/clojure-package/examples/neural-style/project.clj
index 5a8eebea783f..b6d29f7c0e87 100644
--- a/contrib/clojure-package/examples/neural-style/project.clj
+++ b/contrib/clojure-package/examples/neural-style/project.clj
@@ -19,7 +19,7 @@
   :description "Neural Style Transfer with MXNet"
   :plugins [[lein-cljfmt "0.5.7"]]
   :dependencies [[org.clojure/clojure "1.9.0"]
-                 [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.4.0-SNAPSHOT"]
+                 [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.5.0-SNAPSHOT"]
                  [net.mikera/imagez "0.12.0"]
                  [thinktopic/think.image "0.4.16"]]
   :main neural-style.core)
diff --git a/contrib/clojure-package/examples/pre-trained-models/project.clj b/contrib/clojure-package/examples/pre-trained-models/project.clj
index 58b591ce5307..11e002503464 100644
--- a/contrib/clojure-package/examples/pre-trained-models/project.clj
+++ b/contrib/clojure-package/examples/pre-trained-models/project.clj
@@ -19,7 +19,7 @@
   :description "Example of using pre-trained models with MXNet"
   :plugins [[lein-cljfmt "0.5.7"]]
   :dependencies [[org.clojure/clojure "1.9.0"]
-                 [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.4.0-SNAPSHOT"]
+                 [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.5.0-SNAPSHOT"]
                  [net.mikera/imagez "0.12.0"]
                  [thinktopic/think.image "0.4.16"]]
   :main pre-trained-models.fine-tune)
diff --git a/contrib/clojure-package/examples/profiler/project.clj b/contrib/clojure-package/examples/profiler/project.clj
index fa30eafa0daf..cc50482d0418 100644
--- a/contrib/clojure-package/examples/profiler/project.clj
+++ b/contrib/clojure-package/examples/profiler/project.clj
@@ -18,5 +18,5 @@
 (defproject profiler "0.1.0-SNAPSHOT"
   :plugins [[lein-cljfmt "0.5.7"]]
   :dependencies [[org.clojure/clojure "1.9.0"]
-                 [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.4.0-SNAPSHOT"]]
+                 [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.5.0-SNAPSHOT"]]
   :main profiler.core)
diff --git a/contrib/clojure-package/examples/rnn/project.clj b/contrib/clojure-package/examples/rnn/project.clj
index 291f2bd46e3a..64f4c290741c 100644
--- a/contrib/clojure-package/examples/rnn/project.clj
+++ b/contrib/clojure-package/examples/rnn/project.clj
@@ -19,5 +19,5 @@
   :description "RNN example"
   :plugins [[lein-cljfmt "0.5.7"]]
   :dependencies [[org.clojure/clojure "1.9.0"]
-                 [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.4.0-SNAPSHOT"]]
+                 [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.5.0-SNAPSHOT"]]
   :main rnn.train-char-rnn)
diff --git a/contrib/clojure-package/examples/visualization/project.clj b/contrib/clojure-package/examples/visualization/project.clj
index d56ddfb23f0c..d91ace3188e6 100644
--- a/contrib/clojure-package/examples/visualization/project.clj
+++ b/contrib/clojure-package/examples/visualization/project.clj
@@ -19,5 +19,5 @@
   :description "Visualization example"
   :plugins [[lein-cljfmt "0.5.7"]]
   :dependencies [[org.clojure/clojure "1.9.0"]
-                 [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.4.0-SNAPSHOT"]]
+                 [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.5.0-SNAPSHOT"]]
   :main visualization.core)
diff --git a/contrib/clojure-package/project.clj b/contrib/clojure-package/project.clj
index ae7ccd67fd9c..12a0504e02d5 100644
--- a/contrib/clojure-package/project.clj
+++ b/contrib/clojure-package/project.clj
@@ -15,7 +15,7 @@
 ;; limitations under the License.
 ;;
 
-(defproject org.apache.mxnet.contrib.clojure/clojure-mxnet "1.4.0-SNAPSHOT"
+(defproject org.apache.mxnet.contrib.clojure/clojure-mxnet "1.5.0-SNAPSHOT"
   :description "Clojure package for MXNet"
   :url "https://github.com/apache/incubator-mxnet"
   :license {:name "Apache License"
@@ -29,7 +29,7 @@
                  ;[org.apache.mxnet/mxnet-full_2.11-linux-x86_64-gpu "1.2.1"]
 
                  ;;; CI
-                 [org.apache.mxnet/mxnet-full_2.11-linux-x86_64-cpu "1.4.0-SNAPSHOT"]
+                 [org.apache.mxnet/mxnet-full_2.11-linux-x86_64-cpu "1.5.0-SNAPSHOT"]
 
                  [org.clojure/tools.logging "0.4.0"]
                  [org.apache.logging.log4j/log4j-core "2.8.1"]
diff --git a/contrib/clojure-package/scripts/update_versions.sh b/contrib/clojure-package/scripts/update_versions.sh
new file mode 100755
index 000000000000..607e3f357bc9
--- /dev/null
+++ b/contrib/clojure-package/scripts/update_versions.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Run this from the main Clojure project directory with 2 arguments:
+# old-version and new-version (note: `sed -i ''` below is the BSD/macOS form).
+# Ex: scripts/update_versions.sh 1.4.0-SNAPSHOT 1.5.0-SNAPSHOT
+
+set -evx
+echo "Replacing  $1  with  $2  in the directory  $PWD "
+find ./ -type f -exec sed -i '' -e "s/$1/$2/g" {} \;
+echo "Done! Check the changed files"
diff --git a/docs/tutorials/scala/mxnet_scala_on_intellij.md b/docs/tutorials/scala/mxnet_scala_on_intellij.md
index 174e3018098b..a0bf24e34e28 100644
--- a/docs/tutorials/scala/mxnet_scala_on_intellij.md
+++ b/docs/tutorials/scala/mxnet_scala_on_intellij.md
@@ -385,14 +385,14 @@ If you chose to "Build from Source" when following the [install instructions](ht
       <groupId>org.apache.mxnet</groupId>
       <artifactId>mxnet-core_${scala.version}-${platform}-sources</artifactId>
       <scope>system</scope>
-      <systemPath>/PathToMXNetSource/incubator-mxnet/scala-package/assembly/osx-x86_64-cpu/target/mxnet-full_${scala.version}-osx-x86_64-cpu-1.4.0-SNAPSHOT-sources.jar</systemPath>
+      <systemPath>/PathToMXNetSource/incubator-mxnet/scala-package/assembly/osx-x86_64-cpu/target/mxnet-full_${scala.version}-osx-x86_64-cpu-1.5.0-SNAPSHOT-sources.jar</systemPath>
     </dependency>
 
     <dependency>
       <groupId>org.apache.mxnet</groupId>
       <artifactId>mxnet-full_${scala.version}-${platform}</artifactId>
       <scope>system</scope>
-      <systemPath>/PathToMXNetSource/incubator-mxnet/scala-package/assembly/osx-x86_64-cpu/target/mxnet-full_${scala.version}-osx-x86_64-cpu-1.4.0-SNAPSHOT.jar</systemPath>
+      <systemPath>/PathToMXNetSource/incubator-mxnet/scala-package/assembly/osx-x86_64-cpu/target/mxnet-full_${scala.version}-osx-x86_64-cpu-1.5.0-SNAPSHOT.jar</systemPath>
     </dependency>
 ```
 
diff --git a/include/mxnet/base.h b/include/mxnet/base.h
index f773139d6c3e..92d9c2699d63 100644
--- a/include/mxnet/base.h
+++ b/include/mxnet/base.h
@@ -102,7 +102,7 @@
 /*! \brief major version */
 #define MXNET_MAJOR 1
 /*! \brief minor version */
-#define MXNET_MINOR 4
+#define MXNET_MINOR 5
 /*! \brief patch version */
 #define MXNET_PATCH 0
 /*! \brief mxnet version */
diff --git a/python/mxnet/libinfo.py b/python/mxnet/libinfo.py
index 57c73e5943af..ff795f914a4b 100644
--- a/python/mxnet/libinfo.py
+++ b/python/mxnet/libinfo.py
@@ -111,4 +111,4 @@ def find_include_path():
 
 
 # current version
-__version__ = "1.4.0"
+__version__ = "1.5.0"
diff --git a/scala-package/assembly/linux-x86_64-cpu/pom.xml b/scala-package/assembly/linux-x86_64-cpu/pom.xml
index fbc0ab027ac7..abefead175c7 100644
--- a/scala-package/assembly/linux-x86_64-cpu/pom.xml
+++ b/scala-package/assembly/linux-x86_64-cpu/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.apache.mxnet</groupId>
     <artifactId>mxnet-full-parent_2.11</artifactId>
-    <version>1.4.0-SNAPSHOT</version>
+    <version>1.5.0-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 
@@ -18,18 +18,18 @@
     <dependency>
       <groupId>org.apache.mxnet</groupId>
       <artifactId>mxnet-core_${scala.binary.version}</artifactId>
-      <version>1.4.0-SNAPSHOT</version>
+      <version>1.5.0-SNAPSHOT</version>
     </dependency>
     <dependency>
       <groupId>org.apache.mxnet</groupId>
       <artifactId>libmxnet-scala-linux-x86_64-cpu</artifactId>
-      <version>1.4.0-SNAPSHOT</version>
+      <version>1.5.0-SNAPSHOT</version>
       <type>so</type>
     </dependency>
     <dependency>
       <groupId>org.apache.mxnet</groupId>
       <artifactId>mxnet-infer_${scala.binary.version}</artifactId>
-      <version>1.4.0-SNAPSHOT</version>
+      <version>1.5.0-SNAPSHOT</version>
     </dependency>
   </dependencies>
 
diff --git a/scala-package/assembly/linux-x86_64-gpu/pom.xml b/scala-package/assembly/linux-x86_64-gpu/pom.xml
index a1a94808e918..96ffa38c6af2 100644
--- a/scala-package/assembly/linux-x86_64-gpu/pom.xml
+++ b/scala-package/assembly/linux-x86_64-gpu/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.apache.mxnet</groupId>
     <artifactId>mxnet-full-parent_2.11</artifactId>
-    <version>1.4.0-SNAPSHOT</version>
+    <version>1.5.0-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 
@@ -18,18 +18,18 @@
     <dependency>
       <groupId>org.apache.mxnet</groupId>
       <artifactId>mxnet-core_${scala.binary.version}</artifactId>
-      <version>1.4.0-SNAPSHOT</version>
+      <version>1.5.0-SNAPSHOT</version>
     </dependency>
     <dependency>
       <groupId>org.apache.mxnet</groupId>
       <artifactId>libmxnet-scala-linux-x86_64-gpu</artifactId>
-      <version>1.4.0-SNAPSHOT</version>
+      <version>1.5.0-SNAPSHOT</version>
       <type>so</type>
     </dependency>
     <dependency>
       <groupId>org.apache.mxnet</groupId>
       <artifactId>mxnet-infer_${scala.binary.version}</artifactId>
-      <version>1.4.0-SNAPSHOT</version>
+      <version>1.5.0-SNAPSHOT</version>
     </dependency>
   </dependencies>
 
diff --git a/scala-package/assembly/osx-x86_64-cpu/pom.xml b/scala-package/assembly/osx-x86_64-cpu/pom.xml
index bb6af0353762..5c5733a9a4ce 100644
--- a/scala-package/assembly/osx-x86_64-cpu/pom.xml
+++ b/scala-package/assembly/osx-x86_64-cpu/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.apache.mxnet</groupId>
     <artifactId>mxnet-full-parent_2.11</artifactId>
-    <version>1.4.0-SNAPSHOT</version>
+    <version>1.5.0-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 
@@ -18,18 +18,18 @@
     <dependency>
       <groupId>org.apache.mxnet</groupId>
       <artifactId>mxnet-core_${scala.binary.version}</artifactId>
-      <version>1.4.0-SNAPSHOT</version>
+      <version>1.5.0-SNAPSHOT</version>
     </dependency>
     <dependency>
       <groupId>org.apache.mxnet</groupId>
       <artifactId>libmxnet-scala-osx-x86_64-cpu</artifactId>
-      <version>1.4.0-SNAPSHOT</version>
+      <version>1.5.0-SNAPSHOT</version>
       <type>jnilib</type>
     </dependency>
     <dependency>
       <groupId>org.apache.mxnet</groupId>
       <artifactId>mxnet-infer_${scala.binary.version}</artifactId>
-      <version>1.4.0-SNAPSHOT</version>
+      <version>1.5.0-SNAPSHOT</version>
     </dependency>
   </dependencies>
 
diff --git a/scala-package/assembly/pom.xml b/scala-package/assembly/pom.xml
index 8de320eb2ade..c1d1a3b8e721 100644
--- a/scala-package/assembly/pom.xml
+++ b/scala-package/assembly/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.apache.mxnet</groupId>
     <artifactId>mxnet-parent_2.11</artifactId>
-    <version>1.4.0-SNAPSHOT</version>
+    <version>1.5.0-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 
diff --git a/scala-package/core/pom.xml b/scala-package/core/pom.xml
index 3425bb15f62a..484fbbd96790 100644
--- a/scala-package/core/pom.xml
+++ b/scala-package/core/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.apache.mxnet</groupId>
     <artifactId>mxnet-parent_2.11</artifactId>
-    <version>1.4.0-SNAPSHOT</version>
+    <version>1.5.0-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 
@@ -100,13 +100,13 @@
     <dependency>
       <groupId>org.apache.mxnet</groupId>
       <artifactId>mxnet-init_${scala.binary.version}</artifactId>
-      <version>1.4.0-SNAPSHOT</version>
+      <version>1.5.0-SNAPSHOT</version>
       <scope>provided</scope>
     </dependency>
     <dependency>
       <groupId>org.apache.mxnet</groupId>
       <artifactId>mxnet-macros_${scala.binary.version}</artifactId>
-      <version>1.4.0-SNAPSHOT</version>
+      <version>1.5.0-SNAPSHOT</version>
       <scope>provided</scope>
     </dependency>
     <dependency>
diff --git a/scala-package/examples/pom.xml b/scala-package/examples/pom.xml
index 9e8e119c3c4f..8d3d156a0b18 100644
--- a/scala-package/examples/pom.xml
+++ b/scala-package/examples/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.apache.mxnet</groupId>
     <artifactId>mxnet-parent_2.11</artifactId>
-    <version>1.4.0-SNAPSHOT</version>
+    <version>1.5.0-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 
@@ -149,13 +149,13 @@
     <dependency>
       <groupId>org.apache.mxnet</groupId>
       <artifactId>mxnet-core_${scala.binary.version}</artifactId>
-      <version>1.4.0-SNAPSHOT</version>
+      <version>1.5.0-SNAPSHOT</version>
       <scope>provided</scope>
     </dependency>
     <dependency>
       <groupId>org.apache.mxnet</groupId>
       <artifactId>mxnet-infer_${scala.binary.version}</artifactId>
-      <version>1.4.0-SNAPSHOT</version>
+      <version>1.5.0-SNAPSHOT</version>
       <scope>provided</scope>
     </dependency>
     <dependency>
diff --git a/scala-package/infer/pom.xml b/scala-package/infer/pom.xml
index 3e6980cb6f4b..ac76cdd19f3b 100644
--- a/scala-package/infer/pom.xml
+++ b/scala-package/infer/pom.xml
@@ -6,7 +6,7 @@
     <parent>
         <artifactId>mxnet-parent_2.11</artifactId>
         <groupId>org.apache.mxnet</groupId>
-        <version>1.4.0-SNAPSHOT</version>
+        <version>1.5.0-SNAPSHOT</version>
         <relativePath>../pom.xml</relativePath>
     </parent>
 
@@ -89,7 +89,7 @@
         <dependency>
             <groupId>org.apache.mxnet</groupId>
             <artifactId>mxnet-core_${scala.binary.version}</artifactId>
-            <version>1.4.0-SNAPSHOT</version>
+            <version>1.5.0-SNAPSHOT</version>
             <scope>provided</scope>
         </dependency>
         <!-- https://mvnrepository.com/artifact/org.mockito/mockito-all -->
diff --git a/scala-package/init-native/linux-x86_64/pom.xml b/scala-package/init-native/linux-x86_64/pom.xml
index 12a36bd6e944..b71d7cf71528 100644
--- a/scala-package/init-native/linux-x86_64/pom.xml
+++ b/scala-package/init-native/linux-x86_64/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.apache.mxnet</groupId>
     <artifactId>mxnet-scala-init-native-parent</artifactId>
-    <version>1.4.0-SNAPSHOT</version>
+    <version>1.5.0-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 
@@ -20,7 +20,7 @@
     <dependency>
       <groupId>org.apache.mxnet</groupId>
       <artifactId>mxnet-init_${scala.binary.version}</artifactId>
-      <version>1.4.0-SNAPSHOT</version>
+      <version>1.5.0-SNAPSHOT</version>
       <type>jar</type>
       <scope>compile</scope>
     </dependency>
diff --git a/scala-package/init-native/osx-x86_64/pom.xml b/scala-package/init-native/osx-x86_64/pom.xml
index d0290942ef84..b4a0b1d6584a 100644
--- a/scala-package/init-native/osx-x86_64/pom.xml
+++ b/scala-package/init-native/osx-x86_64/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.apache.mxnet</groupId>
     <artifactId>mxnet-scala-init-native-parent</artifactId>
-    <version>1.4.0-SNAPSHOT</version>
+    <version>1.5.0-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 
@@ -20,7 +20,7 @@
     <dependency>
       <groupId>org.apache.mxnet</groupId>
       <artifactId>mxnet-init_${scala.binary.version}</artifactId>
-      <version>1.4.0-SNAPSHOT</version>
+      <version>1.5.0-SNAPSHOT</version>
       <type>jar</type>
       <scope>compile</scope>
     </dependency>
diff --git a/scala-package/init-native/pom.xml b/scala-package/init-native/pom.xml
index 17a829c0c217..bed216e45035 100644
--- a/scala-package/init-native/pom.xml
+++ b/scala-package/init-native/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.apache.mxnet</groupId>
     <artifactId>mxnet-parent_2.11</artifactId>
-    <version>1.4.0-SNAPSHOT</version>
+    <version>1.5.0-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 
diff --git a/scala-package/init/pom.xml b/scala-package/init/pom.xml
index a5b88c308637..4278df6f2e73 100644
--- a/scala-package/init/pom.xml
+++ b/scala-package/init/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.apache.mxnet</groupId>
     <artifactId>mxnet-parent_2.11</artifactId>
-    <version>1.4.0-SNAPSHOT</version>
+    <version>1.5.0-SNAPSHOT</version>
 <!--  <relativePath>../pom.xml</relativePath>-->
   </parent>
 
diff --git a/scala-package/macros/pom.xml b/scala-package/macros/pom.xml
index d435e211ceeb..cd56060b4b36 100644
--- a/scala-package/macros/pom.xml
+++ b/scala-package/macros/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.apache.mxnet</groupId>
     <artifactId>mxnet-parent_2.11</artifactId>
-    <version>1.4.0-SNAPSHOT</version>
+    <version>1.5.0-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 
@@ -53,13 +53,13 @@
     <dependency>
       <groupId>org.apache.mxnet</groupId>
       <artifactId>mxnet-init_${scala.binary.version}</artifactId>
-      <version>1.4.0-SNAPSHOT</version>
+      <version>1.5.0-SNAPSHOT</version>
       <scope>provided</scope>
     </dependency>
     <dependency>
       <groupId>org.apache.mxnet</groupId>
       <artifactId>libmxnet-init-scala-${platform}</artifactId>
-      <version>1.4.0-SNAPSHOT</version>
+      <version>1.5.0-SNAPSHOT</version>
       <scope>provided</scope>
       <type>${libtype}</type>
     </dependency>
diff --git a/scala-package/native/linux-x86_64-cpu/pom.xml b/scala-package/native/linux-x86_64-cpu/pom.xml
index ac8e4a45e67a..2415cf7d26db 100644
--- a/scala-package/native/linux-x86_64-cpu/pom.xml
+++ b/scala-package/native/linux-x86_64-cpu/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.apache.mxnet</groupId>
     <artifactId>mxnet-scala-native-parent</artifactId>
-    <version>1.4.0-SNAPSHOT</version>
+    <version>1.5.0-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 
@@ -20,7 +20,7 @@
     <dependency>
       <groupId>org.apache.mxnet</groupId>
       <artifactId>mxnet-core_${scala.binary.version}</artifactId>
-      <version>1.4.0-SNAPSHOT</version>
+      <version>1.5.0-SNAPSHOT</version>
       <type>jar</type>
       <scope>compile</scope>
     </dependency>
diff --git a/scala-package/native/linux-x86_64-gpu/pom.xml b/scala-package/native/linux-x86_64-gpu/pom.xml
index cdba5774f6a0..0186217234bc 100644
--- a/scala-package/native/linux-x86_64-gpu/pom.xml
+++ b/scala-package/native/linux-x86_64-gpu/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.apache.mxnet</groupId>
     <artifactId>mxnet-scala-native-parent</artifactId>
-    <version>1.4.0-SNAPSHOT</version>
+    <version>1.5.0-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 
@@ -20,7 +20,7 @@
     <dependency>
       <groupId>org.apache.mxnet</groupId>
       <artifactId>mxnet-core_${scala.binary.version}</artifactId>
-      <version>1.4.0-SNAPSHOT</version>
+      <version>1.5.0-SNAPSHOT</version>
       <type>jar</type>
       <scope>compile</scope>
     </dependency>
diff --git a/scala-package/native/osx-x86_64-cpu/pom.xml b/scala-package/native/osx-x86_64-cpu/pom.xml
index 333486c67392..0ab7ca1dd0f0 100644
--- a/scala-package/native/osx-x86_64-cpu/pom.xml
+++ b/scala-package/native/osx-x86_64-cpu/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.apache.mxnet</groupId>
     <artifactId>mxnet-scala-native-parent</artifactId>
-    <version>1.4.0-SNAPSHOT</version>
+    <version>1.5.0-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 
@@ -20,7 +20,7 @@
     <dependency>
       <groupId>org.apache.mxnet</groupId>
       <artifactId>mxnet-core_${scala.binary.version}</artifactId>
-      <version>1.4.0-SNAPSHOT</version>
+      <version>1.5.0-SNAPSHOT</version>
       <type>jar</type>
       <scope>compile</scope>
     </dependency>
diff --git a/scala-package/native/pom.xml b/scala-package/native/pom.xml
index e267c8d797ab..2f6425d21104 100644
--- a/scala-package/native/pom.xml
+++ b/scala-package/native/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.apache.mxnet</groupId>
     <artifactId>mxnet-parent_2.11</artifactId>
-    <version>1.4.0-SNAPSHOT</version>
+    <version>1.5.0-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 
diff --git a/scala-package/pom.xml b/scala-package/pom.xml
index 76bf00b54ba6..151462cbcc68 100644
--- a/scala-package/pom.xml
+++ b/scala-package/pom.xml
@@ -10,7 +10,7 @@
   </parent>
   <groupId>org.apache.mxnet</groupId>
   <artifactId>mxnet-parent_2.11</artifactId>
-  <version>1.4.0-SNAPSHOT</version>
+  <version>1.5.0-SNAPSHOT</version>
   <name>MXNet Scala Package - Parent</name>
   <url>https://github.com/apache/incubator-mxnet/tree/master/scala-package</url>
   <description>
diff --git a/scala-package/spark/pom.xml b/scala-package/spark/pom.xml
index ee4f3efa98e4..2db3bee8c78d 100644
--- a/scala-package/spark/pom.xml
+++ b/scala-package/spark/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.apache.mxnet</groupId>
     <artifactId>mxnet-parent_2.11</artifactId>
-    <version>1.4.0-SNAPSHOT</version>
+    <version>1.5.0-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 
@@ -40,7 +40,7 @@
     <dependency>
       <groupId>org.apache.mxnet</groupId>
       <artifactId>mxnet-core_${scala.binary.version}</artifactId>
-      <version>1.4.0-SNAPSHOT</version>
+      <version>1.5.0-SNAPSHOT</version>
       <scope>provided</scope>
     </dependency>
     <dependency>
diff --git a/snapcraft.yaml b/snapcraft.yaml
index e70bf6e5b4b3..d8d0e301e6b1 100644
--- a/snapcraft.yaml
+++ b/snapcraft.yaml
@@ -1,5 +1,5 @@
 name: mxnet
-version: '1.4.0'
+version: '1.5.0'
 summary: MXNet is a deep learning framework designed for efficiency and flexibility.
 description: |
   MXNet is a deep learning framework designed for both efficiency and 

From 4f61c32611252640ed229dc9b386a51e1e6b3a56 Mon Sep 17 00:00:00 2001
From: Steffen Rochel <steffenrochel@gmail.com>
Date: Fri, 7 Dec 2018 08:53:04 -0800
Subject: [PATCH 46/54] License update  (#13565)

* Update LICENSE

* update license for Clojure, R, ONNX-TRT and location of 3rd party
dependencies.

* fixed typo
---
 LICENSE | 94 +++++++++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 88 insertions(+), 6 deletions(-)

diff --git a/LICENSE b/LICENSE
index a8b57e583764..10dea2522182 100644
--- a/LICENSE
+++ b/LICENSE
@@ -218,16 +218,20 @@
     1. MXNet Cpp-package - For details, /cpp-package/LICENSE
     2. MXNet rcnn - For details, see, example/rcnn/LICENSE
     3. scala-package - For details, see, scala-package/LICENSE
-    4. Warp-CTC - For details, see, src/operator/contrib/ctc_include/LICENSE
+    4. Warp-CTC - For details, see, 3rdparty/ctc_include/LICENSE
     5. 3rdparty/dlpack - For details, see, 3rdparty/dlpack/LICENSE
     6. 3rdparty/dmlc-core - For details, see, 3rdparty/dmlc-core/LICENSE
     7. 3rdparty/mshadow - For details, see, 3rdparty/mshadow/LICENSE
     8. 3rdparty/tvm - For details, see, 3rdparty/tvm/LICENSE
     9. 3rdparty/tvm/dmlc-core - For details, see, 3rdparty/tvm/dmlc-core/LICENSE
-    10. 3rdparty/tvm/nnvm - For details, see, 3rdparty/tvm/nnvm/LICENSE
-    11. 3rdparty/ps-lite - For details, see, 3rdparty/ps-lite/LICENSE
-    12. 3rdparty/mkldnn - For details, see, 3rdparty/mkldnn/LICENSE
-    13. googlemock scripts/generator - For details, see, 3rdparty/googletest/googlemock/scripts/generator/LICENSE
+    10. 3rdparty/tvm/dlpack - For details, see, 3rdparty/tvm/3rdparty/dlpack/LICENSE
+    11. 3rdparty/tvm/nnvm - For details, see, 3rdparty/tvm/nnvm/LICENSE
+    12. 3rdparty/ps-lite - For details, see, 3rdparty/ps-lite/LICENSE
+    13. 3rdparty/mkldnn - For details, see, 3rdparty/mkldnn/LICENSE
+    14. googlemock scripts/generator - For details, see, 3rdparty/googletest/googlemock/scripts/generator/LICENSE
+    15. clojure-package - For details, see, contrib/clojure-package/LICENSE
+    16. R-package - For details, see, R-package/LICENSE
+    17. ONNX-TensorRT benchmark package - For details, see, 3rdparty/onnx-tensorrt/third_party/onnx/third_party/benchmark/LICENSE
 
 
     =======================================================================================
@@ -239,6 +243,9 @@
     3. tree_lstm - For details, see example/gluon/tree_lstm/LICENSE
     4. OpenMP - For details, see 3rdparty/openmp/LICENSE.txt
     5. HalideIR - For details, see nnvm/tvm/HalideIR/LICENSE
+    6. HalideIR - For details, see 3rdparty/tvm/3rdparty/HalideIR/LICENSE
+    7. ONNX-TensorRT - For details, see 3rdparty/onnx-tensorrt/LICENSE
+    8. ONNX-TensorRT - For details, see 3rdparty/onnx-tensorrt/third_party/onnx/LICENSE
 
 
     =======================================================================================
@@ -246,7 +253,7 @@
     =======================================================================================
 
     1. Moderngpu
-    For details, see, src/operator/contrib/ctc_include/contrib/moderngpu/LICENSE
+    For details, see, 3rdparty/ctc_include/contrib/moderngpu/LICENSE
 
     /******************************************************************************
     * Redistribution and use in source and binary forms, with or without
@@ -559,4 +566,79 @@
     #  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     #  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+    =======================================================================================
+
+    12. Google tests
+        For details, see, 3rdparty/mkldnn/tests/gtests/gtest/LICENSE
+
+    Copyright 2008, Google Inc.
+    All rights reserved.
+
+    Redistribution and use in source and binary forms, with or without
+    modification, are permitted provided that the following conditions are
+    met:
+
+        * Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+        * Redistributions in binary form must reproduce the above
+    copyright notice, this list of conditions and the following disclaimer
+    in the documentation and/or other materials provided with the
+    distribution.
+        * Neither the name of Google Inc. nor the names of its
+    contributors may be used to endorse or promote products derived from
+    this software without specific prior written permission.
+
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+    OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    =======================================================================================
+
+    13. ONNX python bindings
+    For details, see, 3rdparty/onnx-tensorrt/third_party/onnx/third_party/pybind11/LICENSE
+
+    Copyright (c) 2016 Wenzel Jakob <wenzel.jakob@epfl.ch>, All rights reserved.
+
+    Redistribution and use in source and binary forms, with or without
+    modification, are permitted provided that the following conditions are met:
+
+    1. Redistributions of source code must retain the above copyright notice, this
+       list of conditions and the following disclaimer.
+
+    2. Redistributions in binary form must reproduce the above copyright notice,
+       this list of conditions and the following disclaimer in the documentation
+       and/or other materials provided with the distribution.
+
+    3. Neither the name of the copyright holder nor the names of its contributors
+       may be used to endorse or promote products derived from this software
+       without specific prior written permission.
+
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+    ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+    WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+    DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+    FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+    DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+    SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+    CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+    OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    You are under no obligation whatsoever to provide any bug fixes, patches, or
+    upgrades to the features, functionality or performance of the source code
+    ("Enhancements") to anyone; however, if you choose to make your Enhancements
+    available either publicly, or directly to the author of this software, without
+    imposing a separate written license agreement for such Enhancements, then you
+    hereby grant the following license: a non-exclusive, royalty-free perpetual
+    license to install, use, modify, prepare derivative works, incorporate into
+    other computer software, distribute, and sublicense such enhancements or
+    derivative works thereof, in binary and source code form.
 

From 7d74452531b95486ff07c211fa37cbc82155f4c9 Mon Sep 17 00:00:00 2001
From: Zhipeng Jia <zhipeng.jia@outlook.com>
Date: Fri, 7 Dec 2018 18:42:35 -0600
Subject: [PATCH 47/54] Fix use-before-assignment in convert_dot (#13511)

---
 python/mxnet/contrib/onnx/mx2onnx/_op_translations.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/python/mxnet/contrib/onnx/mx2onnx/_op_translations.py b/python/mxnet/contrib/onnx/mx2onnx/_op_translations.py
index e605e824be43..0f4b448a5416 100644
--- a/python/mxnet/contrib/onnx/mx2onnx/_op_translations.py
+++ b/python/mxnet/contrib/onnx/mx2onnx/_op_translations.py
@@ -442,6 +442,8 @@ def convert_dot(node, **kwargs):
     MatMul and Transpose operators based on the values set for
     transpose_a, transpose_b attributes."""
     name, input_nodes, attrs = get_inputs(node, kwargs)
+    input_node_a = input_nodes[0]
+    input_node_b = input_nodes[1]
 
     trans_a_node = None
     trans_b_node = None

From 186a746e557d8ef9551f52c6e4c1175394c323c0 Mon Sep 17 00:00:00 2001
From: Jake Lee <gstu1130@gmail.com>
Date: Fri, 7 Dec 2018 17:06:05 -0800
Subject: [PATCH 48/54] fix the situation where idx didn't align with rec
 (#13550)

minor fix the image.py

add last_batch_handle for imagedeiter

remove the label type

refactor the imageiter unit test

fix the trailing whitespace

fix coding style

add new line

move helper function to the top of the file
---
 python/mxnet/image/detection.py     |  64 ++++++++--
 python/mxnet/image/image.py         |   5 +-
 tests/python/unittest/test_image.py | 184 +++++++++++++++-------------
 3 files changed, 157 insertions(+), 96 deletions(-)

diff --git a/python/mxnet/image/detection.py b/python/mxnet/image/detection.py
index b27917c86238..d5b5ecab528a 100644
--- a/python/mxnet/image/detection.py
+++ b/python/mxnet/image/detection.py
@@ -658,19 +658,26 @@ class ImageDetIter(ImageIter):
         Data name for provided symbols.
     label_name : str
         Name for detection labels
+    last_batch_handle : str, optional
+        How to handle the last batch.
+        This parameter can be 'pad' (default), 'discard' or 'roll_over'.
+        If 'pad', the last batch will be padded with data starting from the beginning.
+        If 'discard', the last batch will be discarded.
+        If 'roll_over', the remaining elements will be rolled over to the next iteration.
     kwargs : ...
         More arguments for creating augmenter. See mx.image.CreateDetAugmenter.
     """
     def __init__(self, batch_size, data_shape,
                  path_imgrec=None, path_imglist=None, path_root=None, path_imgidx=None,
                  shuffle=False, part_index=0, num_parts=1, aug_list=None, imglist=None,
-                 data_name='data', label_name='label', **kwargs):
+                 data_name='data', label_name='label', last_batch_handle='pad', **kwargs):
         super(ImageDetIter, self).__init__(batch_size=batch_size, data_shape=data_shape,
                                            path_imgrec=path_imgrec, path_imglist=path_imglist,
                                            path_root=path_root, path_imgidx=path_imgidx,
                                            shuffle=shuffle, part_index=part_index,
                                            num_parts=num_parts, aug_list=[], imglist=imglist,
-                                           data_name=data_name, label_name=label_name)
+                                           data_name=data_name, label_name=label_name,
+                                           last_batch_handle=last_batch_handle)
 
         if aug_list is None:
             self.auglist = CreateDetAugmenter(data_shape, **kwargs)
@@ -751,14 +758,10 @@ def reshape(self, data_shape=None, label_shape=None):
             self.provide_label = [(self.provide_label[0][0], (self.batch_size,) + label_shape)]
             self.label_shape = label_shape
 
-    def next(self):
-        """Override the function for returning next batch."""
+    def _batchify(self, batch_data, batch_label, start=0):
+        """Override the helper function for batchifying data"""
+        i = start
         batch_size = self.batch_size
-        c, h, w = self.data_shape
-        batch_data = nd.zeros((batch_size, c, h, w))
-        batch_label = nd.empty(self.provide_label[0][1])
-        batch_label[:] = -1
-        i = 0
         try:
             while i < batch_size:
                 label, s = self.next_sample()
@@ -783,7 +786,48 @@ def next(self):
             if not i:
                 raise StopIteration
 
-        return io.DataBatch([batch_data], [batch_label], batch_size - i)
+        return i
+
+    def next(self):
+        """Override the function for returning next batch."""
+        batch_size = self.batch_size
+        c, h, w = self.data_shape
+        # if last batch data is rolled over
+        if self._cache_data is not None:
+            # check both the data and label have values
+            assert self._cache_label is not None, "_cache_label didn't have values"
+            assert self._cache_idx is not None, "_cache_idx didn't have values"
+            batch_data = self._cache_data
+            batch_label = self._cache_label
+            i = self._cache_idx
+        else:
+            batch_data = nd.zeros((batch_size, c, h, w))
+            batch_label = nd.empty(self.provide_label[0][1])
+            batch_label[:] = -1
+            i = self._batchify(batch_data, batch_label)
+        # calculate the padding
+        pad = batch_size - i
+        # handle padding for the last batch
+        if pad != 0:
+            if self.last_batch_handle == 'discard':
+                raise StopIteration
+            # if the option is 'roll_over', throw StopIteration and cache the data
+            elif self.last_batch_handle == 'roll_over' and \
+                self._cache_data is None:
+                self._cache_data = batch_data
+                self._cache_label = batch_label
+                self._cache_idx = i
+                raise StopIteration
+            else:
+                _ = self._batchify(batch_data, batch_label, i)
+                if self.last_batch_handle == 'pad':
+                    self._allow_read = False
+                else:
+                    self._cache_data = None
+                    self._cache_label = None
+                    self._cache_idx = None
+
+        return io.DataBatch([batch_data], [batch_label], pad=pad)
 
     def augmentation_transform(self, data, label):  # pylint: disable=arguments-differ
         """Override Transforms input data with specified augmentations."""
diff --git a/python/mxnet/image/image.py b/python/mxnet/image/image.py
index c9a457f5b7e2..9c2a1cbfba2a 100644
--- a/python/mxnet/image/image.py
+++ b/python/mxnet/image/image.py
@@ -1145,7 +1145,7 @@ def __init__(self, batch_size, data_shape, label_width=1,
         self.shuffle = shuffle
         if self.imgrec is None:
             self.seq = imgkeys
-        elif shuffle or num_parts > 1:
+        elif shuffle or num_parts > 1 or path_imgidx:
             assert self.imgidx is not None
             self.seq = self.imgidx
         else:
@@ -1261,7 +1261,7 @@ def next(self):
             i = self._cache_idx
             # clear the cache data
         else:
-            batch_data = nd.empty((batch_size, c, h, w))
+            batch_data = nd.zeros((batch_size, c, h, w))
             batch_label = nd.empty(self.provide_label[0][1])
             i = self._batchify(batch_data, batch_label)
         # calculate the padding
@@ -1285,6 +1285,7 @@ def next(self):
                     self._cache_data = None
                     self._cache_label = None
                     self._cache_idx = None
+
         return io.DataBatch([batch_data], [batch_label], pad=pad)
 
     def check_data_shape(self, data_shape):
diff --git a/tests/python/unittest/test_image.py b/tests/python/unittest/test_image.py
index 4f66823cdbf1..4063027cc1e5 100644
--- a/tests/python/unittest/test_image.py
+++ b/tests/python/unittest/test_image.py
@@ -25,6 +25,7 @@
 
 from nose.tools import raises
 
+
 def _get_data(url, dirname):
     import os, tarfile
     download(url, dirname=dirname, overwrite=False)
@@ -50,6 +51,62 @@ def _generate_objects():
     label = np.hstack((cid[:, np.newaxis], boxes)).ravel().tolist()
     return [2, 5] + label
 
+def _test_imageiter_last_batch(imageiter_list, assert_data_shape):
+    test_iter = imageiter_list[0]
+    # test batch data shape
+    for _ in range(3):
+        for batch in test_iter:
+            assert batch.data[0].shape == assert_data_shape
+        test_iter.reset()
+    # test last batch handle(discard)
+    test_iter = imageiter_list[1]
+    i = 0
+    for batch in test_iter:
+        i += 1
+    assert i == 5
+    # test last_batch_handle(pad)
+    test_iter = imageiter_list[2]
+    i = 0
+    for batch in test_iter:
+        if i == 0:
+            first_three_data = batch.data[0][:2]
+        if i == 5:
+            last_three_data = batch.data[0][1:]
+        i += 1
+    assert i == 6
+    assert np.array_equal(first_three_data.asnumpy(), last_three_data.asnumpy())
+    # test last_batch_handle(roll_over)
+    test_iter = imageiter_list[3]
+    i = 0
+    for batch in test_iter:
+        if i == 0:
+            first_image = batch.data[0][0]
+        i += 1
+    assert i == 5
+    test_iter.reset()
+    first_batch_roll_over = test_iter.next()
+    assert np.array_equal(
+        first_batch_roll_over.data[0][1].asnumpy(), first_image.asnumpy())
+    assert first_batch_roll_over.pad == 2
+    # test iterator works properly after calling reset several times when last_batch_handle is roll_over
+    for _ in test_iter:
+        pass
+    test_iter.reset()
+    first_batch_roll_over_twice = test_iter.next()
+    assert np.array_equal(
+        first_batch_roll_over_twice.data[0][2].asnumpy(), first_image.asnumpy())
+    assert first_batch_roll_over_twice.pad == 1
+    # we've called next once
+    i = 1
+    for _ in test_iter:
+        i += 1
+    # test the third epoch with size 6
+    assert i == 6
+    # test shuffle option for sanity test
+    test_iter = imageiter_list[4]
+    for _ in test_iter:
+        pass
+
 
 class TestImage(unittest.TestCase):
     IMAGES_URL = "http://data.mxnet.io/data/test_images.tar.gz"
@@ -151,86 +208,32 @@ def test_color_normalize(self):
             assert_almost_equal(mx_result.asnumpy(), (src - mean) / std, atol=1e-3)
 
     def test_imageiter(self):
-        def check_imageiter(dtype='float32'):
-            im_list = [[np.random.randint(0, 5), x] for x in TestImage.IMAGES]
-            fname = './data/test_imageiter.lst'
-            file_list = ['\t'.join([str(k), str(np.random.randint(0, 5)), x])
-                         for k, x in enumerate(TestImage.IMAGES)]
-            with open(fname, 'w') as f:
-                for line in file_list:
-                    f.write(line + '\n')
-
-            test_list = ['imglist', 'path_imglist']
+        im_list = [[np.random.randint(0, 5), x] for x in TestImage.IMAGES]
+        fname = './data/test_imageiter.lst'
+        file_list = ['\t'.join([str(k), str(np.random.randint(0, 5)), x])
+                        for k, x in enumerate(TestImage.IMAGES)]
+        with open(fname, 'w') as f:
+            for line in file_list:
+                f.write(line + '\n')
 
+        test_list = ['imglist', 'path_imglist']
+        for dtype in ['int32', 'float32', 'int64', 'float64']:
             for test in test_list:
                 imglist = im_list if test == 'imglist' else None
                 path_imglist = fname if test == 'path_imglist' else None
-
-                test_iter = mx.image.ImageIter(2, (3, 224, 224), label_width=1, imglist=imglist,
-                    path_imglist=path_imglist, path_root='', dtype=dtype)
-                # test batch data shape
-                for _ in range(3):
-                    for batch in test_iter:
-                        assert batch.data[0].shape == (2, 3, 224, 224)
-                    test_iter.reset()
-                # test last batch handle(discard)
-                test_iter = mx.image.ImageIter(3, (3, 224, 224), label_width=1, imglist=imglist,
-                    path_imglist=path_imglist, path_root='', dtype=dtype, last_batch_handle='discard')
-                i = 0
-                for batch in test_iter:
-                    i += 1
-                assert i == 5
-                # test last_batch_handle(pad)
-                test_iter = mx.image.ImageIter(3, (3, 224, 224), label_width=1, imglist=imglist,
-                    path_imglist=path_imglist, path_root='', dtype=dtype, last_batch_handle='pad')
-                i = 0
-                for batch in test_iter:
-                    if i == 0:
-                        first_three_data = batch.data[0][:2]
-                    if i == 5:
-                        last_three_data = batch.data[0][1:]
-                    i += 1
-                assert i == 6
-                assert np.array_equal(first_three_data.asnumpy(), last_three_data.asnumpy())
-                # test last_batch_handle(roll_over)
-                test_iter = mx.image.ImageIter(3, (3, 224, 224), label_width=1, imglist=imglist,
-                    path_imglist=path_imglist, path_root='', dtype=dtype, last_batch_handle='roll_over')
-                i = 0
-                for batch in test_iter:
-                    if i == 0:
-                        first_image = batch.data[0][0]
-                    i += 1
-                assert i == 5
-                test_iter.reset()
-                first_batch_roll_over = test_iter.next()
-                assert np.array_equal(
-                    first_batch_roll_over.data[0][1].asnumpy(), first_image.asnumpy())
-                assert first_batch_roll_over.pad == 2
-                # test iteratopr work properly after calling reset several times when last_batch_handle is roll_over
-                for _ in test_iter:
-                    pass
-                test_iter.reset()
-                first_batch_roll_over_twice = test_iter.next()
-                assert np.array_equal(
-                    first_batch_roll_over_twice.data[0][2].asnumpy(), first_image.asnumpy())
-                assert first_batch_roll_over_twice.pad == 1
-                # we've called next once
-                i = 1
-                for _ in test_iter:
-                    i += 1
-                # test the third epoch with size 6
-                assert i == 6
-                # test shuffle option for sanity test
-                test_iter = mx.image.ImageIter(3, (3, 224, 224), label_width=1, imglist=imglist, shuffle=True,
-                                               path_imglist=path_imglist, path_root='', dtype=dtype, last_batch_handle='pad')
-                for _ in test_iter:
-                    pass
-
-        for dtype in ['int32', 'float32', 'int64', 'float64']:
-            check_imageiter(dtype)
-
-        # test with default dtype
-        check_imageiter()
+                imageiter_list = [
+                    mx.image.ImageIter(2, (3, 224, 224), label_width=1, imglist=imglist,
+                        path_imglist=path_imglist, path_root='', dtype=dtype),
+                    mx.image.ImageIter(3, (3, 224, 224), label_width=1, imglist=imglist,
+                        path_imglist=path_imglist, path_root='', dtype=dtype, last_batch_handle='discard'),
+                    mx.image.ImageIter(3, (3, 224, 224), label_width=1, imglist=imglist,
+                        path_imglist=path_imglist, path_root='', dtype=dtype, last_batch_handle='pad'),
+                    mx.image.ImageIter(3, (3, 224, 224), label_width=1, imglist=imglist,
+                        path_imglist=path_imglist, path_root='', dtype=dtype, last_batch_handle='roll_over'),
+                    mx.image.ImageIter(3, (3, 224, 224), label_width=1, imglist=imglist, shuffle=True,
+                        path_imglist=path_imglist, path_root='', dtype=dtype, last_batch_handle='pad')
+                ]
+                _test_imageiter_last_batch(imageiter_list, (2, 3, 224, 224))
 
     @with_seed()
     def test_augmenters(self):
@@ -259,16 +262,20 @@ def test_image_detiter(self):
         im_list = [_generate_objects() + [x] for x in TestImage.IMAGES]
         det_iter = mx.image.ImageDetIter(2, (3, 300, 300), imglist=im_list, path_root='')
         for _ in range(3):
-            for batch in det_iter:
+            for _ in det_iter:
                 pass
-            det_iter.reset()
-
+        det_iter.reset()
         val_iter = mx.image.ImageDetIter(2, (3, 300, 300), imglist=im_list, path_root='')
         det_iter = val_iter.sync_label_shape(det_iter)
         assert det_iter.data_shape == val_iter.data_shape
         assert det_iter.label_shape == val_iter.label_shape
 
-        # test file list
+        # test batch_size is not divisible by number of images
+        det_iter = mx.image.ImageDetIter(4, (3, 300, 300), imglist=im_list, path_root='')
+        for _ in det_iter:
+            pass
+
+        # test file list with last batch handle
         fname = './data/test_imagedetiter.lst'
         im_list = [[k] + _generate_objects() + [x] for k, x in enumerate(TestImage.IMAGES)]
         with open(fname, 'w') as f:
@@ -276,10 +283,19 @@ def test_image_detiter(self):
                 line = '\t'.join([str(k) for k in line])
                 f.write(line + '\n')
 
-        det_iter = mx.image.ImageDetIter(2, (3, 400, 400), path_imglist=fname,
-            path_root='')
-        for batch in det_iter:
-            pass
+        imageiter_list = [
+            mx.image.ImageDetIter(2, (3, 400, 400),
+                path_imglist=fname, path_root=''),
+            mx.image.ImageDetIter(3, (3, 400, 400),
+                path_imglist=fname, path_root='', last_batch_handle='discard'),
+            mx.image.ImageDetIter(3, (3, 400, 400),
+                path_imglist=fname, path_root='', last_batch_handle='pad'),
+            mx.image.ImageDetIter(3, (3, 400, 400),
+                path_imglist=fname, path_root='', last_batch_handle='roll_over'),
+            mx.image.ImageDetIter(3, (3, 400, 400), shuffle=True,
+                path_imglist=fname, path_root='', last_batch_handle='pad')
+        ]
+        _test_imageiter_last_batch(imageiter_list, (2, 3, 400, 400))
 
     def test_det_augmenters(self):
         # only test if all augmenters will work

From 9e3f974e12c982759bb11bddbfef4f251a12c70a Mon Sep 17 00:00:00 2001
From: Chaitanya Prakash Bapat <chai.bapat@gmail.com>
Date: Fri, 7 Dec 2018 19:16:52 -0800
Subject: [PATCH 49/54] Update MXNetTutorialTemplate.ipynb (#13568)

Fix typos
---
 example/MXNetTutorialTemplate.ipynb | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/example/MXNetTutorialTemplate.ipynb b/example/MXNetTutorialTemplate.ipynb
index 851a87f18247..0b0b19381809 100644
--- a/example/MXNetTutorialTemplate.ipynb
+++ b/example/MXNetTutorialTemplate.ipynb
@@ -13,7 +13,7 @@
    "source": [
     "A brief introduction to the tutorial that describes:\n",
     "\n",
-    "- The problem that that the tutorial addresses\n",
+    "- The problem that the tutorial addresses\n",
     "- Who the intended audience is\n",
     "- The expected experience level of that audience with a concept or tool \n",
     "- Which environment/language it runs in \n",
@@ -239,7 +239,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "If appropriate, summarize the tasks required to create the model, defining and explaining key concepts."
+    "If appropriate, summarize the tasks required to fit the model, defining and explaining key concepts."
    ]
   },
   {
@@ -342,7 +342,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "To *fperform the task*, *provide explanation here.*"
+    "To *perform the task*, *provide explanation here.*"
    ]
   },
   {

From 636933d424d789661d9e954ebfb569e1a2945a78 Mon Sep 17 00:00:00 2001
From: Vandana Kannan <vandanavk@users.noreply.github.com>
Date: Fri, 7 Dec 2018 19:39:47 -0800
Subject: [PATCH 50/54] ONNX import/export: Size (#13112)

---
 python/mxnet/contrib/onnx/mx2onnx/_op_translations.py | 8 ++++++++
 python/mxnet/contrib/onnx/onnx2mx/_import_helper.py   | 3 ++-
 python/mxnet/contrib/onnx/onnx2mx/_op_translations.py | 4 ++++
 tests/python-pytest/onnx/export/onnx_backend_test.py  | 3 ++-
 tests/python-pytest/onnx/import/test_cases.py         | 3 ++-
 5 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/python/mxnet/contrib/onnx/mx2onnx/_op_translations.py b/python/mxnet/contrib/onnx/mx2onnx/_op_translations.py
index 0f4b448a5416..0d20c76240bd 100644
--- a/python/mxnet/contrib/onnx/mx2onnx/_op_translations.py
+++ b/python/mxnet/contrib/onnx/mx2onnx/_op_translations.py
@@ -1647,3 +1647,11 @@ def convert_logical_not(node, **kwargs):
     and return the created node.
     """
     return create_basic_op_node('Not', node, kwargs)
+
+
+@mx_op.register("size_array")
+def convert_size(node, **kwargs):
+    """Map MXNet's size_array operator attributes to onnx's Size operator
+    and return the created node.
+    """
+    return create_basic_op_node('Size', node, kwargs)
diff --git a/python/mxnet/contrib/onnx/onnx2mx/_import_helper.py b/python/mxnet/contrib/onnx/onnx2mx/_import_helper.py
index f61910f838ea..2ceabaec1dcd 100644
--- a/python/mxnet/contrib/onnx/onnx2mx/_import_helper.py
+++ b/python/mxnet/contrib/onnx/onnx2mx/_import_helper.py
@@ -21,7 +21,7 @@
 from ._op_translations import identity, random_uniform, random_normal
 from ._op_translations import add, subtract, multiply, divide, absolute, negative, add_n
 from ._op_translations import tanh, arccos, arcsin, arctan, _cos, _sin, _tan
-from ._op_translations import softplus, shape, gather, lp_pooling
+from ._op_translations import softplus, shape, gather, lp_pooling, size
 from ._op_translations import ceil, floor, hardsigmoid, global_lppooling
 from ._op_translations import concat
 from ._op_translations import leaky_relu, _elu, _prelu, _selu, softmax, fully_connected
@@ -139,6 +139,7 @@
     'Softplus'          : softplus,
     'Tan'               : _tan,
     'Shape'             : shape,
+    'Size'              : size,
     'Gather'            : gather,
     'HardSigmoid'       : hardsigmoid,
     'LpPool'            : lp_pooling,
diff --git a/python/mxnet/contrib/onnx/onnx2mx/_op_translations.py b/python/mxnet/contrib/onnx/onnx2mx/_op_translations.py
index 368b98d662b1..702832529314 100644
--- a/python/mxnet/contrib/onnx/onnx2mx/_op_translations.py
+++ b/python/mxnet/contrib/onnx/onnx2mx/_op_translations.py
@@ -642,6 +642,10 @@ def shape(attrs, inputs, proto_obj):
     """Returns shape of input array."""
     return 'shape_array', attrs, inputs
 
+def size(attrs, inputs, proto_obj):
+    """Returns array containing size of data."""
+    return "size_array", attrs, inputs
+
 def reduce_l2(attrs, inputs, proto_obj):
     """Reduce input tensor by l2 normalization."""
     new_attrs = translation_utils._fix_attribute_names(attrs, {'axes':'axis'})
diff --git a/tests/python-pytest/onnx/export/onnx_backend_test.py b/tests/python-pytest/onnx/export/onnx_backend_test.py
index be9273eb6fac..c9926c4d5e15 100644
--- a/tests/python-pytest/onnx/export/onnx_backend_test.py
+++ b/tests/python-pytest/onnx/export/onnx_backend_test.py
@@ -97,7 +97,8 @@
     'test_depthtospace',
     'test_hardsigmoid',
     'test_instancenorm',
-    'test_shape'
+    'test_shape',
+    'test_size'
     ]
 
 BASIC_MODEL_TESTS = [
diff --git a/tests/python-pytest/onnx/import/test_cases.py b/tests/python-pytest/onnx/import/test_cases.py
index f41fe92352db..e0b26cc49830 100644
--- a/tests/python-pytest/onnx/import/test_cases.py
+++ b/tests/python-pytest/onnx/import/test_cases.py
@@ -85,7 +85,8 @@
     'test_operator_maxpool',
     'test_operator_params',
     'test_operator_permute2',
-    'test_depthtospace'
+    'test_depthtospace',
+    'test_size'
     ]
 
 BASIC_MODEL_TESTS = [

From 95f1e1c51a38d34e62baa08975c7fc3548ae82e0 Mon Sep 17 00:00:00 2001
From: Steffen Rochel <steffenrochel@gmail.com>
Date: Fri, 7 Dec 2018 19:41:06 -0800
Subject: [PATCH 51/54] fix link for gluon model zoo (#13583)

---
 docs/community/ecosystem.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/community/ecosystem.md b/docs/community/ecosystem.md
index 54f8c8993ea9..100ed973cf3f 100644
--- a/docs/community/ecosystem.md
+++ b/docs/community/ecosystem.md
@@ -62,7 +62,7 @@ Community contributions to MXNet have added many new valuable features and funct
 
 ## Model Zoos
 
-* [Gluon Model Zoo](https://github.com/awslabs/mxnet-model-server) - models trained in Gluon and available through Gluon's model zoo API.
+* [Gluon Model Zoo](https://mxnet.incubator.apache.org/api/python/gluon/model_zoo.html) - models trained in Gluon and available through Gluon's model zoo API.
 * [ONNX Model Zoo](https://github.com/onnx/models) - ONNX models from a variety of ONNX-supported frameworks.
 
 

From 7d2b804823bca19bb303f77076d899319deda6bb Mon Sep 17 00:00:00 2001
From: Anirudh Subramanian <anirudh2290@apache.org>
Date: Fri, 7 Dec 2018 19:46:57 -0800
Subject: [PATCH 52/54] Fix exception handling api doc (#13519)

* Fix exception handling api doc

* Update waitall api doc

Co-Authored-By: anirudh2290 <anirudh2290@apache.org>
---
 python/mxnet/ndarray/ndarray.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/python/mxnet/ndarray/ndarray.py b/python/mxnet/ndarray/ndarray.py
index 78ec0b91f88d..4e6d0cdc929f 100644
--- a/python/mxnet/ndarray/ndarray.py
+++ b/python/mxnet/ndarray/ndarray.py
@@ -157,6 +157,11 @@ def waitall():
     """Wait for all async operations to finish in MXNet.
 
     This function is used for benchmarking only.
+    .. warning::
+    If your code has exceptions, `waitall` can cause silent failures.
+    For this reason you should avoid `waitall` in your code.
+    Use it only if you are confident that your code is error free.
+    Then make sure you call `wait_to_read` on all outputs after `waitall`.
     """
     check_call(_LIB.MXNDArrayWaitAll())
 

From f2ca66f2c537783aa60251080582793f42f395a7 Mon Sep 17 00:00:00 2001
From: Lai Wei <royweilai@gmail.com>
Date: Fri, 7 Dec 2018 19:57:54 -0800
Subject: [PATCH 53/54] add cpp example inception to nightly test (#13534)

* add inception test

* fix max iter for mlp

* rename and add comment

* rename epoch num
---
 cpp-package/example/mlp.cpp  | 10 +++++-----
 cpp-package/tests/ci_test.sh |  3 +++
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/cpp-package/example/mlp.cpp b/cpp-package/example/mlp.cpp
index 595d75c67c06..cc16f53cf205 100644
--- a/cpp-package/example/mlp.cpp
+++ b/cpp-package/example/mlp.cpp
@@ -144,13 +144,13 @@ void MLP() {
                                grad_req_type, aux_states);
 
   std::cout << "Training" << std::endl;
-  int max_iters = 20000;
+  int max_epoch = 15000;
   mx_float learning_rate = 0.0001;
-  for (int iter = 0; iter < max_iters; ++iter) {
+  for (int epoch_num = 0; epoch_num < max_epoch; ++epoch_num) {
     exe->Forward(true);
-
-    if (iter % 100 == 0) {
-      std::cout << "epoch " << iter << std::endl;
+    // print accuracy every 100 epochs
+    if (epoch_num % 100 == 0) {
+      std::cout << "epoch " << epoch_num << std::endl;
       std::vector<NDArray>& out = exe->outputs;
       float* cptr = new float[128 * 10];
       out[0].SyncCopyToCPU(cptr, 128 * 10);
diff --git a/cpp-package/tests/ci_test.sh b/cpp-package/tests/ci_test.sh
index 7674e2d988b2..4a17d8d34c19 100755
--- a/cpp-package/tests/ci_test.sh
+++ b/cpp-package/tests/ci_test.sh
@@ -36,6 +36,9 @@ cp ../../build/cpp-package/example/lenet_with_mxdataiter .
 cp ../../build/cpp-package/example/resnet .
 ./resnet 5
 
+cp ../../build/cpp-package/example/inception_bn .
+./inception_bn 5
+
 cp ../../build/cpp-package/example/mlp .
 ./mlp
 

From ba02bf2fe2da423caa59ddb3fd5e433b90b730bf Mon Sep 17 00:00:00 2001
From: Pedro Larroy <pedro.larroy.lists@gmail.com>
Date: Mon, 10 Dec 2018 15:13:12 +0100
Subject: [PATCH 54/54] Add notes about debug with libstdc++ symbols (#13533)

---
 ci/README.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/ci/README.md b/ci/README.md
index e11c140d4c43..f56a6f6a7978 100644
--- a/ci/README.md
+++ b/ci/README.md
@@ -172,11 +172,12 @@ nosetests-3.4 -v -s tests/python/unittest/test_ndarray.py
 
 
 # Debug with cgdb
-
+sudo apt install -y libstdc++6-6-dbg
 cgdb build/tests/mxnet_unit_tests
 
 (gdb) !pwd
 /home/qemu/mxnet
 (gdb) set substitute-path /work /home/qemu
+(gdb) set substitute-path /build/gcc-6-6mK9AW/gcc-6-6.3.0/build/arm-linux-gnueabihf/libstdc++-v3/include/ /usr/include/c++/6/
 (gdb) r --gtest_filter="ACTIVATION_PERF.ExecuteBidirectional"
 ```