From 16bc2a241c33f60363eeeec2f23c997a8e9bd391 Mon Sep 17 00:00:00 2001 From: Yemin Shi Date: Mon, 21 Dec 2015 11:13:15 +0800 Subject: [PATCH 1/9] fix implicit-conversion-failure-from-initializer-list --- src/symbol/graph_executor.cc | 2 +- src/symbol/static_graph.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/symbol/graph_executor.cc b/src/symbol/graph_executor.cc index 0134135c58e5..2a1b4884d43e 100644 --- a/src/symbol/graph_executor.cc +++ b/src/symbol/graph_executor.cc @@ -300,7 +300,7 @@ void GraphExecutor::InitGraph(const Symbol &symbol, } std::sort(head_nodes.begin(), head_nodes.end()); head_nodes.resize(std::unique(head_nodes.begin(), head_nodes.end()) - head_nodes.begin()); - std::vector fwd_nodes = graph_.PostDFSOrder(head_nodes, {}); + std::vector fwd_nodes = graph_.PostDFSOrder(head_nodes, std::unordered_set()); num_forward_nodes_ = fwd_nodes.size(); std::unordered_set fwd_set(fwd_nodes.begin(), fwd_nodes.end()); diff --git a/src/symbol/static_graph.h b/src/symbol/static_graph.h index db93602b80bf..869d96375045 100644 --- a/src/symbol/static_graph.h +++ b/src/symbol/static_graph.h @@ -183,7 +183,7 @@ class StaticGraph { * \return a post DFS visit order of nodes that can reach heads. */ std::vector PostDFSOrder(const std::vector& head_nodes, - const std::unordered_set& banned = {}) const; + const std::unordered_set& banned = std::unordered_set()) const; /*! * \brief infer the node shapes in the computation graph. * From aad6806484ca8c51f1171b93323593c0bbf579cd Mon Sep 17 00:00:00 2001 From: Yemin Shi Date: Mon, 21 Dec 2015 12:43:06 +0800 Subject: [PATCH 2/9] break src/symbol/static_graph.h line 186 --- src/symbol/static_graph.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/symbol/static_graph.h b/src/symbol/static_graph.h index 869d96375045..639a47d8a4a7 100644 --- a/src/symbol/static_graph.h +++ b/src/symbol/static_graph.h @@ -183,7 +183,8 @@ class StaticGraph { * \return a post DFS visit order of nodes that can reach heads. */ std::vector PostDFSOrder(const std::vector& head_nodes, - const std::unordered_set& banned = std::unordered_set()) const; + const std::unordered_set& banned + = std::unordered_set()) const; /*! * \brief infer the node shapes in the computation graph. * From 71c46cb74cf5d11ca2683d7d1d606c41256ca354 Mon Sep 17 00:00:00 2001 From: qiao hai-jun Date: Mon, 21 Dec 2015 22:27:10 +0800 Subject: [PATCH 3/9] add a shell to get ptb data add a shell to get ptb data --- example/rnn/get_ptb_data.sh | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 example/rnn/get_ptb_data.sh diff --git a/example/rnn/get_ptb_data.sh b/example/rnn/get_ptb_data.sh new file mode 100644 index 000000000000..ea7e1d4e6aaf --- /dev/null +++ b/example/rnn/get_ptb_data.sh @@ -0,0 +1,13 @@ +#!/bin/env bash + +RNN_DIR=$(cd `dirname $0`; pwd) +DATA_DIR="${RNN_DIR}/data/" + +if [[ ! 
-d "${DATA_DIR}" ]]; then + echo "${DATA_DIR} doesn't exist, will create one"; + mkdir -p ${DATA_DIR} +fi + +wget -P ${DATA_DIR} https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/ptb/ptb.train.txt; +wget -P ${DATA_DIR} https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/ptb/ptb.valid.txt; +wget -P ${DATA_DIR} https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/ptb/ptb.test.txt; From afddc97ae35b19d9c94de8286f711c930828fb5d Mon Sep 17 00:00:00 2001 From: qiao hai-jun Date: Mon, 21 Dec 2015 22:29:49 +0800 Subject: [PATCH 4/9] correct train's output change the swith to with --- example/rnn/lstm.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/example/rnn/lstm.py b/example/rnn/lstm.py index 33fdb507f0dd..b560d5d270cf 100644 --- a/example/rnn/lstm.py +++ b/example/rnn/lstm.py @@ -187,8 +187,8 @@ def calc_nll(seq_label_probs, X, begin): def train_lstm(model, X_train_batch, X_val_batch, num_round, update_period, optimizer='rmsprop', half_life=2,max_grad_norm = 5.0, **kwargs): - print("Training swith train.shape=%s" % str(X_train_batch.shape)) - print("Training swith val.shape=%s" % str(X_val_batch.shape)) + print("Training with train.shape=%s" % str(X_train_batch.shape)) + print("Training with val.shape=%s" % str(X_val_batch.shape)) m = model seq_len = len(m.seq_data) batch_size = m.seq_data[0].shape[0] From 13e9ff80506eb31c9a0dd55578c18f629437862a Mon Sep 17 00:00:00 2001 From: qiao hai-jun Date: Tue, 22 Dec 2015 00:24:05 +0800 Subject: [PATCH 5/9] add a shell scripts to download ptb dataset --- example/rnn/get_ptb_data.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/example/rnn/get_ptb_data.sh b/example/rnn/get_ptb_data.sh index ea7e1d4e6aaf..2b517f4ebc4d 100644 --- a/example/rnn/get_ptb_data.sh +++ b/example/rnn/get_ptb_data.sh @@ -10,4 +10,4 @@ fi wget -P ${DATA_DIR} https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/ptb/ptb.train.txt; wget -P ${DATA_DIR} https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/ptb/ptb.valid.txt; -wget -P ${DATA_DIR} https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/ptb/ptb.test.txt; +wget -P ${DATA_DIR} https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/ptb/ptb.test.txt; From 4b87b586eb89332bec0a4dbb18e9e4936002c67e Mon Sep 17 00:00:00 2001 From: Mu Li Date: Mon, 21 Dec 2015 13:59:36 -0500 Subject: [PATCH 6/9] Update README.md --- example/image-classification/README.md | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/example/image-classification/README.md b/example/image-classification/README.md index 468b04b6b8e2..b55ee6fa1152 100644 --- a/example/image-classification/README.md +++ b/example/image-classification/README.md @@ -147,31 +147,31 @@ model.fit(X=train_data, y=train_label) The following factors may significant affect the performance: -- Use a fast backend. A fast BLAS library, e.g. openblas, altas, +1. Use a fast backend. A fast BLAS library, e.g. openblas, altas, and mkl, is necessary if only using CPU. While for Nvidia GPUs, we strongly recommend to use CUDNN. -- Three important things for the input data: - - data format. If you are using the `rec` format, then everything should be +2. Three important things for the input data: + 1. data format. If you are using the `rec` format, then everything should be fine. - - decoding. In default MXNet uses 4 CPU threads for decoding the images, which + 2. decoding. 
In default MXNet uses 4 CPU threads for decoding the images, which are often able to decode over 1k images per second. You may increase the number of threads if either you are using a low-end CPU or you GPUs are very powerful. - - place to store the data. Any local or distributed filesystem (HDFS, Amazon + 3. place to store the data. Any local or distributed filesystem (HDFS, Amazon S3) should be fine. There may be a problem if multiple machines read the data from the network shared filesystem (NFS) at the same time. -- Use a large batch size. We often choose the largest one which can fit into +3. Use a large batch size. We often choose the largest one which can fit into the GPU memory. But a too large value may slow down the convergence. For example, the safe batch size for CIFAR 10 is around 200, while for ImageNet 1K, the batch size can go beyond 1K. -- Choose the proper `kvstore` if using more than one GPU. (See +4. Choose the proper `kvstore` if using more than one GPU. (See [doc/developer-guide/multi_node.md](../../doc/developer-guide/multi_node.md) for more information) - - For a single machine, often the default `local` is good enough. But you may want + 1. For a single machine, often the default `local` is good enough. But you may want to use `local_allreduce_device` for models with size >> 100MB such as AlexNet and VGG. But also note that `local_allreduce_device` takes more GPU memory than others. - - For multiple machines, we recommend to try `dist_sync` first. But if the + 2. For multiple machines, we recommend to try `dist_sync` first. But if the model size is quite large or you use a large number of machines, you may want to use `dist_async`. ## Results From c0919d3db3ada9f474ed82e48d2f1baa1f1e2de9 Mon Sep 17 00:00:00 2001 From: terrytangyuan Date: Mon, 21 Dec 2015 20:59:31 -0600 Subject: [PATCH 7/9] Added Adam Optimizer --- .../scala/ml/dmlc/mxnet/optimizer/Adam.scala | 83 +++++++++++++++++++ 1 file changed, 83 insertions(+) create mode 100644 scala-package/core/src/main/scala/ml/dmlc/mxnet/optimizer/Adam.scala diff --git a/scala-package/core/src/main/scala/ml/dmlc/mxnet/optimizer/Adam.scala b/scala-package/core/src/main/scala/ml/dmlc/mxnet/optimizer/Adam.scala new file mode 100644 index 000000000000..bc84877cd7e3 --- /dev/null +++ b/scala-package/core/src/main/scala/ml/dmlc/mxnet/optimizer/Adam.scala @@ -0,0 +1,83 @@ +import ml.dmlc.mxnet.{NDArray, Optimizer, LRScheduler} +import scala.math + +/** + * Adam optimizer as described in [King2014] + * + * [King2014] Diederik Kingma, Jimmy Ba, + * Adam: A Method for Stochastic Optimization, + * http://arxiv.org/abs/1412.6980 + * + * @param learningRate Float, Step size. + * @param beta1 Float, Exponential decay rate for the first moment estimates. + * @param beta2 Float, Exponential decay rate for the second moment estimates. + * @param epsilon Float + * @param decayFactor Float + * @param wd Float, L2 regularization coefficient add to all the weights + * @param rescaleGrad Float, rescaling factor of gradient. 
+ * @param clipGradient Float, clip gradient in range [-clip_gradient, clip_gradient] + * @param lrScheduler The learning rate scheduler + */ +class Adam(val learningRate: Float = 0.002f, val beta1: Float = 0.9f, val beta2: Float = 0.999f, + val epsilon: Float = 0.00000001f, val decayFactor: Float = 1-0.00000001f, val wd: Float = 0.0f, + rescaleGrad: Float = 1f, val clipGradient: Float = 0f, + val lrScheduler: LRScheduler = null) extends Optimizer(rescaleGrad: Float) { + + protected var time: Int = 0 + protected var timeFirstIndex: Int + /** + * Update the parameters. + * @param index An unique integer key used to index the parameters + * @param weight weight ndarray + * @param grad grad ndarray + * @param state NDArray or other objects returned by initState + * The auxiliary state used in optimization. + */ + override def update(index: Int, weight: NDArray, grad: NDArray, state: AnyRef): Unit = { + val lr = + (if (lrScheduler != null) { + val scheduledLr = lrScheduler(numUpdate) + updateCount(index) + scheduledLr + } else { + this.learningRate + }) * lrScale.getOrElse(index, 1f) + + var mean, variance = state + + if (timeFirstIndex == null) { + timeFirstIndex = index + time = 0 + } else if (timeFirstIndex == index) { + time += 1 + } + + val t1: Int = time + 1 + learningRate = (lr * math.sqrt(1.0 - math.pow(beta2, t1))/(1.0 - math.pow(beta1, t1))) + val beta1t = beta1 * math.pow(decayFactor, t1 - 1) + + + var grad = grad * rescaleGrad + if (clipGradient != 0f) { + grad = NDArray.clip(grad, -clipGradient, clipGradient) + } + + // mean_t + if (state != null) { + val mom = state.asInstanceOf[NDArray] + mom *= momentum + mom += -lr * (grad + wd * weight) + weight += mom + } else { + require(momentum == 0f) + weight += -lr * (grad + wd * weight) + } + } + + // Create additional optimizer state: mean, variance + override def createState(index: Int, weight: NDArray): AnyRef = { + timeFirstIndex = null + (NDArray.zeros(weight.shape, weight.context), // mean + NDArray.zeros(weight.shape, weight.context)) // variance + } +} From 63f311eb76c9251c68fbea0798e832f55546d68f Mon Sep 17 00:00:00 2001 From: terrytangyuan Date: Tue, 22 Dec 2015 01:37:06 -0600 Subject: [PATCH 8/9] Finished Adam optimizer --- .../scala/ml/dmlc/mxnet/optimizer/Adam.scala | 41 ++++++++++--------- 1 file changed, 21 insertions(+), 20 deletions(-) diff --git a/scala-package/core/src/main/scala/ml/dmlc/mxnet/optimizer/Adam.scala b/scala-package/core/src/main/scala/ml/dmlc/mxnet/optimizer/Adam.scala index bc84877cd7e3..b06f1969f732 100644 --- a/scala-package/core/src/main/scala/ml/dmlc/mxnet/optimizer/Adam.scala +++ b/scala-package/core/src/main/scala/ml/dmlc/mxnet/optimizer/Adam.scala @@ -1,5 +1,5 @@ import ml.dmlc.mxnet.{NDArray, Optimizer, LRScheduler} -import scala.math +import ml.dmlc.mxnet.NDArrayConversions._ /** * Adam optimizer as described in [King2014] @@ -18,13 +18,13 @@ import scala.math * @param clipGradient Float, clip gradient in range [-clip_gradient, clip_gradient] * @param lrScheduler The learning rate scheduler */ -class Adam(val learningRate: Float = 0.002f, val beta1: Float = 0.9f, val beta2: Float = 0.999f, +class Adam(var learningRate: Float = 0.002f, val beta1: Float = 0.9f, val beta2: Float = 0.999f, val epsilon: Float = 0.00000001f, val decayFactor: Float = 1-0.00000001f, val wd: Float = 0.0f, rescaleGrad: Float = 1f, val clipGradient: Float = 0f, val lrScheduler: LRScheduler = null) extends Optimizer(rescaleGrad: Float) { protected var time: Int = 0 - protected var timeFirstIndex: Int + protected var 
timeFirstIndex: Int = 0 /** * Update the parameters. * @param index An unique integer key used to index the parameters @@ -43,9 +43,9 @@ class Adam(val learningRate: Float = 0.002f, val beta1: Float = 0.9f, val beta2: this.learningRate }) * lrScale.getOrElse(index, 1f) - var mean, variance = state + var (mean, variance) = state - if (timeFirstIndex == null) { + if (timeFirstIndex == 0) { timeFirstIndex = index time = 0 } else if (timeFirstIndex == index) { @@ -53,30 +53,31 @@ class Adam(val learningRate: Float = 0.002f, val beta1: Float = 0.9f, val beta2: } val t1: Int = time + 1 - learningRate = (lr * math.sqrt(1.0 - math.pow(beta2, t1))/(1.0 - math.pow(beta1, t1))) - val beta1t = beta1 * math.pow(decayFactor, t1 - 1) + learningRate = (lr * math.sqrt(1.0 - math.pow(beta2, t1))/(1.0 - math.pow(beta1, t1))) toFloat + val beta1t = beta1 * math.pow(decayFactor, t1 - 1) toFloat - - var grad = grad * rescaleGrad + var resdGrad = grad * rescaleGrad if (clipGradient != 0f) { - grad = NDArray.clip(grad, -clipGradient, clipGradient) + resdGrad = NDArray.clip(resdGrad, -clipGradient, clipGradient) } - // mean_t - if (state != null) { - val mom = state.asInstanceOf[NDArray] - mom *= momentum - mom += -lr * (grad + wd * weight) - weight += mom - } else { - require(momentum == 0f) - weight += -lr * (grad + wd * weight) + val meanT = beta1t * mean.asInstanceOf[NDArray] + (1.0 - beta1t) * resdGrad toScalar + val varianceT = beta2 * variance.asInstanceOf[NDArray] + (1.0f - beta2) * resdGrad * resdGrad toScalar + + var step = learningRate * meanT / (math.sqrt(varianceT) + epsilon) + + if (wd > 0.0f) { + step += lr * wd * weight } + + weight += -step.toFloat + mean = meanT + variance = varianceT } // Create additional optimizer state: mean, variance override def createState(index: Int, weight: NDArray): AnyRef = { - timeFirstIndex = null + timeFirstIndex = 0 (NDArray.zeros(weight.shape, weight.context), // mean NDArray.zeros(weight.shape, weight.context)) // variance } From edaeeec79d8d1889590d19a5b7d51ff647387a4e Mon Sep 17 00:00:00 2001 From: terrytangyuan Date: Tue, 22 Dec 2015 01:51:28 -0600 Subject: [PATCH 9/9] Minor fix to pass the test --- .../core/src/main/scala/ml/dmlc/mxnet/optimizer/Adam.scala | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/scala-package/core/src/main/scala/ml/dmlc/mxnet/optimizer/Adam.scala b/scala-package/core/src/main/scala/ml/dmlc/mxnet/optimizer/Adam.scala index b06f1969f732..5616506e78e6 100644 --- a/scala-package/core/src/main/scala/ml/dmlc/mxnet/optimizer/Adam.scala +++ b/scala-package/core/src/main/scala/ml/dmlc/mxnet/optimizer/Adam.scala @@ -8,6 +8,8 @@ import ml.dmlc.mxnet.NDArrayConversions._ * Adam: A Method for Stochastic Optimization, * http://arxiv.org/abs/1412.6980 * + * @author Yuan Tang + * * @param learningRate Float, Step size. * @param beta1 Float, Exponential decay rate for the first moment estimates. * @param beta2 Float, Exponential decay rate for the second moment estimates. @@ -67,7 +69,7 @@ class Adam(var learningRate: Float = 0.002f, val beta1: Float = 0.9f, val beta2: var step = learningRate * meanT / (math.sqrt(varianceT) + epsilon) if (wd > 0.0f) { - step += lr * wd * weight + step += (lr * wd * weight).toScalar } weight += -step.toFloat
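
For reference, the update rule that the Adam patches above implement (Kingma & Ba, "Adam: A Method for Stochastic Optimization", http://arxiv.org/abs/1412.6980) can be sketched in plain Scala on Array[Double], independent of MXNet's NDArray API. The object and method names below and the toy quadratic problem are illustrative only, not part of the patch series, and the sketch omits the patch's decayFactor schedule and weight-decay (wd) term.

// Standalone sketch of the Adam update rule from Kingma & Ba (2014),
// using plain Array[Double] in place of NDArray. Names are illustrative;
// the decayFactor schedule and wd term from the patched optimizer are omitted.
object AdamSketch {
  def step(weight: Array[Double], grad: Array[Double],
           mean: Array[Double], variance: Array[Double], t: Int,
           lr: Double = 0.002, beta1: Double = 0.9, beta2: Double = 0.999,
           epsilon: Double = 1e-8): Unit = {
    // Bias-corrected step size, matching the
    // lr * sqrt(1 - beta2^t) / (1 - beta1^t) expression in the patched optimizer.
    val lrT = lr * math.sqrt(1.0 - math.pow(beta2, t)) / (1.0 - math.pow(beta1, t))
    var i = 0
    while (i < weight.length) {
      mean(i) = beta1 * mean(i) + (1.0 - beta1) * grad(i)                   // first moment
      variance(i) = beta2 * variance(i) + (1.0 - beta2) * grad(i) * grad(i) // second moment
      weight(i) -= lrT * mean(i) / (math.sqrt(variance(i)) + epsilon)
      i += 1
    }
  }

  def main(args: Array[String]): Unit = {
    // Toy problem: minimize f(w) = w^2 starting from w = 1 (gradient is 2w).
    val w = Array(1.0)
    val m = Array(0.0)
    val v = Array(0.0)
    for (t <- 1 to 1000) step(w, Array(2.0 * w(0)), m, v, t, lr = 0.05)
    println(f"w after 1000 Adam steps: ${w(0)}%.6f") // converges toward 0
  }
}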