diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 2868ecef1480..1567cfcf2568 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -91,3 +91,4 @@ List of Contributors * [Xianliang Wang](https://github.com/wangxianliang) * [Junru Shao](https://github.com/yzgysjr) * [Xiao Liu](https://github.com/skylook) +* [Lowik CHANUSSOT](https://github.com/Nzeuwik) diff --git a/R-package/DESCRIPTION b/R-package/DESCRIPTION index a58759426cb1..00021fdefbd3 100644 --- a/R-package/DESCRIPTION +++ b/R-package/DESCRIPTION @@ -2,7 +2,7 @@ Package: mxnet Type: Package Title: MXNet Version: 0.5 -Date: 2015-10-02 +Date: 2015-12-23 Author: Tianqi Chen, Qiang Kou, Tong He Maintainer: Qiang Kou Description: MXNet is a deep learning framework designed for both efficiency diff --git a/R-package/NAMESPACE b/R-package/NAMESPACE index 98389b70e218..69085bb83d1f 100644 --- a/R-package/NAMESPACE +++ b/R-package/NAMESPACE @@ -27,6 +27,7 @@ export(mx.gpu) export(mx.init.create) export(mx.init.normal) export(mx.init.uniform) +export(mx.io.CSVIter) export(mx.io.ImageRecordIter) export(mx.io.MNISTIter) export(mx.io.arrayiter) @@ -42,19 +43,30 @@ export(mx.mlp) export(mx.model.FeedForward.create) export(mx.model.load) export(mx.model.save) +export(mx.nd.abs) export(mx.nd.array) +export(mx.nd.ceil) export(mx.nd.choose.element.0index) export(mx.nd.clip) export(mx.nd.copyto) +export(mx.nd.cos) export(mx.nd.dot) export(mx.nd.exp) +export(mx.nd.floor) export(mx.nd.load) export(mx.nd.log) +export(mx.nd.max) +export(mx.nd.min) export(mx.nd.norm) export(mx.nd.ones) +export(mx.nd.round) +export(mx.nd.rsqrt) export(mx.nd.save) +export(mx.nd.sign) +export(mx.nd.sin) export(mx.nd.sqrt) export(mx.nd.square) +export(mx.nd.sum) export(mx.nd.zeros) export(mx.opt.create) export(mx.opt.get.updater) @@ -70,24 +82,38 @@ export(mx.symbol.Convolution) export(mx.symbol.Deconvolution) export(mx.symbol.Dropout) export(mx.symbol.ElementWiseSum) +export(mx.symbol.Embedding) export(mx.symbol.Flatten) export(mx.symbol.FullyConnected) export(mx.symbol.Group) +export(mx.symbol.IdentityAttachKLSparseReg) export(mx.symbol.LRN) export(mx.symbol.LeakyReLU) export(mx.symbol.LinearRegressionOutput) export(mx.symbol.LogisticRegressionOutput) +export(mx.symbol.MAERegressionOutput) export(mx.symbol.Pooling) export(mx.symbol.Reshape) export(mx.symbol.SliceChannel) export(mx.symbol.Softmax) +export(mx.symbol.SoftmaxActivation) export(mx.symbol.SoftmaxOutput) +export(mx.symbol.SwapAxis) +export(mx.symbol.UpSampling) export(mx.symbol.Variable) +export(mx.symbol.abs) +export(mx.symbol.ceil) +export(mx.symbol.cos) export(mx.symbol.exp) +export(mx.symbol.floor) export(mx.symbol.infer.shape) export(mx.symbol.load) export(mx.symbol.log) +export(mx.symbol.round) +export(mx.symbol.rsqrt) export(mx.symbol.save) +export(mx.symbol.sign) +export(mx.symbol.sin) export(mx.symbol.sqrt) export(mx.symbol.square) export(mxnet.export) diff --git a/R-package/R/mxnet_generated.R b/R-package/R/mxnet_generated.R index 3aaaff0bc537..d1d8a14f103e 100644 --- a/R-package/R/mxnet_generated.R +++ b/R-package/R/mxnet_generated.R @@ -2,6 +2,26 @@ # Generated by mxnet.export, do not edit by hand. 
###### +#' Take absolute value of the src +#' +#' @param src NDArray +#' Source input to the function +#' @return out The result mx.ndarray +#' +#' @export +#' @name mx.nd.abs +NULL + +#' Take ceil value of the src +#' +#' @param src NDArray +#' Source input to the function +#' @return out The result mx.ndarray +#' +#' @export +#' @name mx.nd.ceil +NULL + #' Choose one element from each line(row for python, column for R/Julia) in lhs according to index indicated by rhs. This function assumes rhs uses 0-based index. #' #' @param lhs NDArray @@ -28,6 +48,16 @@ NULL #' @name mx.nd.clip NULL +#' Take cos of the src +#' +#' @param src NDArray +#' Source input to the function +#' @return out The result mx.ndarray +#' +#' @export +#' @name mx.nd.cos +NULL + #' Calculate 2D matrix multiplication #' #' @param lhs NDArray @@ -50,6 +80,16 @@ NULL #' @name mx.nd.exp NULL +#' Take floor value of the src +#' +#' @param src NDArray +#' Source input to the function +#' @return out The result mx.ndarray +#' +#' @export +#' @name mx.nd.floor +NULL + #' Take log of the src #' #' @param src NDArray @@ -60,6 +100,26 @@ NULL #' @name mx.nd.log NULL +#' Take max of the src. The result will be ndarray of shape (1,) on the same device. +#' +#' @param src NDArray +#' Source input to the function +#' @return out The result mx.ndarray +#' +#' @export +#' @name mx.nd.max +NULL + +#' Take min of the src. The result will be ndarray of shape (1,) on the same device. +#' +#' @param src NDArray +#' Source input to the function +#' @return out The result mx.ndarray +#' +#' @export +#' @name mx.nd.min +NULL + #' Take L2 norm of the src. The result will be ndarray of shape (1,) on the same device. #' #' @param src NDArray @@ -70,6 +130,46 @@ NULL #' @name mx.nd.norm NULL +#' Take round value of the src +#' +#' @param src NDArray +#' Source input to the function +#' @return out The result mx.ndarray +#' +#' @export +#' @name mx.nd.round +NULL + +#' Take rsqrt of the src +#' +#' @param src NDArray +#' Source input to the function +#' @return out The result mx.ndarray +#' +#' @export +#' @name mx.nd.rsqrt +NULL + +#' Take sign value of the src +#' +#' @param src NDArray +#' Source input to the function +#' @return out The result mx.ndarray +#' +#' @export +#' @name mx.nd.sign +NULL + +#' Take sin of the src +#' +#' @param src NDArray +#' Source input to the function +#' @return out The result mx.ndarray +#' +#' @export +#' @name mx.nd.sin +NULL + #' Take sqrt of the src #' #' @param src NDArray @@ -90,6 +190,33 @@ NULL #' @name mx.nd.square NULL +#' Take sum of the src. The result will be ndarray of shape (1,) on the same device. +#' +#' @param src NDArray +#' Source input to the function +#' @return out The result mx.ndarray +#' +#' @export +#' @name mx.nd.sum +NULL + +#' Create iterator for dataset in csv. +#' +#' @param data.csv string, required +#' Dataset Param: Data csv path. +#' @param data.shape Shape(tuple), required +#' Dataset Param: Shape of the data. +#' @param label.csv string, optional, default='NULL' +#' Dataset Param: Label csv path. If it is NULL, all labels will be returned as 0 +#' @param label.shape Shape(tuple), optional, default=(1,) +#' Dataset Param: Shape of the label. +#' @return iter The result mx.dataiter +#' +#' @export +mx.io.CSVIter <- function(...) { + mx.varg.io.CSVIter(list(...)) +} + #' Create iterator for dataset packed in recordio. #' #' @param path.imglist string, optional, default='' @@ -114,13 +241,9 @@ #' Augmentation Param: Random Seed.
#' @param batch.size int (non-negative), required #' Batch Param: Batch size. -#' @param data.shape Shape(tuple), required -#' Dataset Param: Shape of each instance generated by the DataIter. -#' @param label.width int (non-negative), optional, default=1 -#' Dataset Param: Label width. #' @param round.batch boolean, optional, default=True #' Batch Param: Use round robin to handle overflow batch. -#' @param prefetch.buffer long (non-negative), optional, default=4 +#' @param prefetch.buffer , optional, default=4 #' Backend Param: Number of prefetched parameters #' @param rand.crop boolean, optional, default=False #' Augmentation Param: Whether to random crop on the image @@ -197,7 +320,7 @@ mx.io.ImageRecordIter <- function(...) { #' partition the data into multiple parts #' @param part.index int, optional, default='0' #' the index of the part will read -#' @param prefetch.buffer long (non-negative), optional, default=4 +#' @param prefetch.buffer , optional, default=4 #' Backend Param: Number of prefetched parameters #' @return iter The result mx.dataiter #' @@ -206,11 +329,11 @@ mx.io.MNISTIter <- function(...) { mx.varg.io.MNISTIter(list(...)) } -#' Apply activation function to input. +#' Apply activation function to input. Softmax Activation is only available with CUDNN on GPU and will be computed at each location across channel if input is 4D. #' #' @param data Symbol #' Input data to activation function. -#' @param act.type {'relu', 'sigmoid', 'tanh'}, required +#' @param act.type {'relu', 'sigmoid', 'softrelu', 'tanh'}, required #' Activation function to be applied. #' @param name string, optional #' Name of the resulting symbol. @@ -225,10 +348,12 @@ mx.symbol.Activation <- function(...) { #' #' @param data Symbol #' Input data to batch normalization -#' @param eps float, optional, default=1e-10 +#' @param eps float, optional, default=0.001 #' Epsilon to prevent div 0 -#' @param momentum float, optional, default=0.1 +#' @param momentum float, optional, default=0.9 #' Momentum for moving average +#' @param fix.gamma boolean, optional, default=True +#' Fix gamma while training #' @param name string, optional #' Name of the resulting symbol. #' @return out The result mx.symbol @@ -255,6 +380,8 @@ mx.symbol.BlockGrad <- function(...) { #' #' @param num.args int, required #' Number of inputs to be concated. +#' @param dim int, optional, default='1' +#' the dimension to be concated. #' @param name string, optional #' Name of the resulting symbol. #' @return out The result mx.symbol @@ -283,7 +410,7 @@ mx.symbol.Concat <- function(...) { #' @param num.group int (non-negative), optional, default=1 #' Number of groups partition. This option is not supported by CuDNN, you can use SliceChannel to num_group,apply convolution and concat instead to achieve the same need. #' @param workspace long (non-negative), optional, default=512 -#' Tmp workspace for convolution (MB) +#' Tmp workspace for convolution (MB). #' @param no.bias boolean, optional, default=False #' Whether to disable bias parameter. #' @param name string, optional @@ -354,6 +481,25 @@ mx.symbol.ElementWiseSum <- function(...) { mx.varg.symbol.ElementWiseSum(list(...)) } +#' Get embedding for one-hot input +#' +#' @param data Symbol +#' Input data to the EmbeddingOp. +#' @param weight Symbol +#' Embedding weight matrix. +#' @param input.dim int, required +#' input dim of one-hot encoding +#' @param output.dim int, required +#' output dim of embedding +#' @param name string, optional +#' Name of the resulting symbol.
+#' @return out The result mx.symbol +#' +#' @export +mx.symbol.Embedding <- function(...) { + mx.varg.symbol.Embedding(list(...)) +} + #' Flatten input #' #' @param data Symbol @@ -388,6 +534,25 @@ mx.symbol.FullyConnected <- function(...) { mx.varg.symbol.FullyConnected(list(...)) } +#' Apply a sparse regularization to the output of a sigmoid activation function. +#' +#' @param data Symbol +#' Input data. +#' @param sparseness.target float, optional, default=0.1 +#' The sparseness target +#' @param penalty float, optional, default=0.001 +#' The tradeoff parameter for the sparseness penalty +#' @param momentum float, optional, default=0.9 +#' The momentum for running average +#' @param name string, optional +#' Name of the resulting symbol. +#' @return out The result mx.symbol +#' +#' @export +mx.symbol.IdentityAttachKLSparseReg <- function(...) { + mx.varg.symbol.IdentityAttachKLSparseReg(list(...)) +} + #' Apply convolution to input then add a bias. #' #' @param data Symbol @@ -413,10 +578,10 @@ mx.symbol.LRN <- function(...) { #' #' @param data Symbol #' Input data to activation function. -#' @param act.type {'leaky', 'prelu', 'rrelu'},optional, default='leaky' +#' @param act.type {'elu', 'leaky', 'prelu', 'rrelu'},optional, default='leaky' #' Activation function to be applied. #' @param slope float, optional, default=0.25 -#' Init slope for the activation. (For leaky only) +#' Init slope for the activation. (For leaky and elu only) #' @param lower.bound float, optional, default=0.125 #' Lower bound of random slope. (For rrelu only) #' @param upper.bound float, optional, default=0.334 @@ -436,6 +601,8 @@ mx.symbol.LeakyReLU <- function(...) { #' Input data to function. #' @param label Symbol #' Input label to function. +#' @param grad.scale float, optional, default=1 +#' Scale the gradient by a float factor #' @param name string, optional #' Name of the resulting symbol. #' @return out The result mx.symbol @@ -452,6 +619,8 @@ mx.symbol.LinearRegressionOutput <- function(...) { #' Input data to function. #' @param label Symbol #' Input label to function. +#' @param grad.scale float, optional, default=1 +#' Scale the gradient by a float factor #' @param name string, optional #' Name of the resulting symbol. #' @return out The result mx.symbol @@ -461,6 +630,23 @@ mx.symbol.LogisticRegressionOutput <- function(...) { mx.varg.symbol.LogisticRegressionOutput(list(...)) } +#' Use mean absolute error regression for final output; this is used on the final output of a net. +#' +#' @param data Symbol +#' Input data to function. +#' @param label Symbol +#' Input label to function. +#' @param grad.scale float, optional, default=1 +#' Scale the gradient by a float factor +#' @param name string, optional +#' Name of the resulting symbol. +#' @return out The result mx.symbol +#' +#' @export +mx.symbol.MAERegressionOutput <- function(...) { + mx.varg.symbol.MAERegressionOutput(list(...)) +} + #' Perform spatial pooling on inputs. #' #' @param data Symbol @@ -487,7 +673,7 @@ mx.symbol.Pooling <- function(...) { #' @param data Symbol #' Input data to reshape. #' @param target.shape Shape(tuple), required -#' Target new shape +#' Target new shape. One and only one dim can be 0, in which case it will be inferred from the rest of dims #' @param name string, optional #' Name of the resulting symbol. #' @return out The result mx.symbol @@ -527,6 +713,21 @@ mx.symbol.Softmax <- function(...) { mx.varg.symbol.Softmax(list(...)) } +#' Apply softmax activation to input. This is intended for internal layers.
For output (loss layer) please use SoftmaxOutput. If type=instance, this operator will compute a softmax for each instance in the batch; this is the default mode. If type=channel, this operator will compute a num_channel-class softmax at each position of each instance; this can be used for fully convolutional network, image segmentation, etc. +#' +#' @param data Symbol +#' Input data to activation function. +#' @param type {'channel', 'instance'},optional, default='instance' +#' Softmax Mode. If set to instance, this operator will compute a softmax for each instance in the batch; this is the default mode. If set to channel, this operator will compute a num_channel-class softmax at each position of each instance; this can be used for fully convolutional network, image segmentation, etc. +#' @param name string, optional +#' Name of the resulting symbol. +#' @return out The result mx.symbol +#' +#' @export +mx.symbol.SoftmaxActivation <- function(...) { + mx.varg.symbol.SoftmaxActivation(list(...)) +} + #' Perform a softmax transformation on input, backprop with logloss. #' #' @param data Symbol @@ -544,6 +745,81 @@ mx.symbol.SoftmaxOutput <- function(...) { mx.varg.symbol.SoftmaxOutput(list(...)) } +#' Apply swapaxis to input. +#' +#' @param data Symbol +#' Input data to the SwapAxisOp. +#' @param dim1 int (non-negative), optional, default=0 +#' the first axis to be swapped. +#' @param dim2 int (non-negative), optional, default=0 +#' the second axis to be swapped. +#' @param name string, optional +#' Name of the resulting symbol. +#' @return out The result mx.symbol +#' +#' @export +mx.symbol.SwapAxis <- function(...) { + mx.varg.symbol.SwapAxis(list(...)) +} + +#' Perform nearest neighbor/bilinear up sampling to inputs +#' +#' @param scale int (non-negative), required +#' Up sampling scale +#' @param num.filter int (non-negative), optional, default=0 +#' Input filter. Only used by nearest sample_type. +#' @param sample.type {'bilinear', 'nearest'}, required +#' upsampling method +#' @param num.args int, required +#' Number of inputs to be upsampled. For nearest neighbor upsampling, this can be 1-N; the size of output will be (scale*h_0,scale*w_0) and all other inputs will be upsampled to the same size. For bilinear upsampling this must be 2; 1 input and 1 weight. +#' @param name string, optional +#' Name of the resulting symbol. +#' @return out The result mx.symbol +#' +#' @export +mx.symbol.UpSampling <- function(...) { + mx.varg.symbol.UpSampling(list(...)) +} + +#' Take absolute value of the src +#' +#' @param src Symbol +#' Source symbolic input to the function +#' @param name string, optional +#' Name of the resulting symbol. +#' @return out The result mx.symbol +#' +#' @export +mx.symbol.abs <- function(...) { + mx.varg.symbol.abs(list(...)) +} + +#' Take ceil value of the src +#' +#' @param src Symbol +#' Source symbolic input to the function +#' @param name string, optional +#' Name of the resulting symbol. +#' @return out The result mx.symbol +#' +#' @export +mx.symbol.ceil <- function(...) { + mx.varg.symbol.ceil(list(...)) +} + +#' Take cos of the src +#' +#' @param src Symbol +#' Source symbolic input to the function +#' @param name string, optional +#' Name of the resulting symbol. +#' @return out The result mx.symbol +#' +#' @export +mx.symbol.cos <- function(...) { + mx.varg.symbol.cos(list(...)) +} + #' Take exp of the src #' #' @param src Symbol @@ -557,6 +833,19 @@ mx.symbol.exp <- function(...)
{ mx.varg.symbol.exp(list(...)) } +#' Take floor value of the src +#' +#' @param src Symbol +#' Source symbolic input to the function +#' @param name string, optional +#' Name of the resulting symbol. +#' @return out The result mx.symbol +#' +#' @export +mx.symbol.floor <- function(...) { + mx.varg.symbol.floor(list(...)) +} + #' Take log of the src #' #' @param src Symbol @@ -570,6 +859,58 @@ mx.symbol.log <- function(...) { mx.varg.symbol.log(list(...)) } +#' Take round value of the src +#' +#' @param src Symbol +#' Source symbolic input to the function +#' @param name string, optional +#' Name of the resulting symbol. +#' @return out The result mx.symbol +#' +#' @export +mx.symbol.round <- function(...) { + mx.varg.symbol.round(list(...)) +} + +#' Take rsqrt of the src +#' +#' @param src Symbol +#' Source symbolic input to the function +#' @param name string, optional +#' Name of the resulting symbol. +#' @return out The result mx.symbol +#' +#' @export +mx.symbol.rsqrt <- function(...) { + mx.varg.symbol.rsqrt(list(...)) +} + +#' Take sign value of the src +#' +#' @param src Symbol +#' Source symbolic input to the function +#' @param name string, optional +#' Name of the resulting symbol. +#' @return out The result mx.symbol +#' +#' @export +mx.symbol.sign <- function(...) { + mx.varg.symbol.sign(list(...)) +} + +#' Take sin of the src +#' +#' @param src Symbol +#' Source symbolic input to the function +#' @param name string, optional +#' Name of the resulting symbol. +#' @return out The result mx.symbol +#' +#' @export +mx.symbol.sin <- function(...) { + mx.varg.symbol.sin(list(...)) +} + #' Take sqrt of the src #' #' @param src Symbol diff --git a/R-package/R/symbol.R b/R-package/R/symbol.R index 1165c02074df..0a3db864a65f 100644 --- a/R-package/R/symbol.R +++ b/R-package/R/symbol.R @@ -16,7 +16,7 @@ NULL #' #' @export mx.symbol.Group <- function(...) 
{ - mx.varg.symbo.internal.Group(list(...)) + mx.varg.symbol.internal.Group(list(...)) } #' Save an mx.symbol object @@ -105,14 +105,25 @@ init.symbol.methods <- function() { setMethod("+", signature(e1 = "Rcpp_MXSymbol", e2 = "Rcpp_MXSymbol"), function(e1, e2) { mx.varg.symbol.internal.Plus(list(e1, e2)) }) + setMethod("+", signature(e1 = "Rcpp_MXSymbol", e2 = "numeric"), function(e1, e2) { + mx.varg.symbol.internal.PlusScalar(list(e1, scalar = e2)) + }) setMethod("-", signature(e1 = "Rcpp_MXSymbol", e2 = "Rcpp_MXSymbol"), function(e1, e2) { mx.varg.symbol.internal.Minus(list(e1, e2)) }) + setMethod("-", signature(e1 = "Rcpp_MXSymbol", e2 = "numeric"), function(e1, e2) { + mx.varg.symbol.internal.MinusScalar(list(e1, scalar = e2)) + }) setMethod("*", signature(e1 = "Rcpp_MXSymbol", e2 = "Rcpp_MXSymbol"), function(e1, e2) { mx.varg.symbol.internal.Mul(list(e1, e2)) }) + setMethod("*", signature(e1 = "Rcpp_MXSymbol", e2 = "numeric"), function(e1, e2) { + mx.varg.symbol.internal.MulScalar(list(e1, scalar = e2)) + }) setMethod("/", signature(e1 = "Rcpp_MXSymbol", e2 = "Rcpp_MXSymbol"), function(e1, e2) { mx.varg.symbol.internal.Div(list(e1, e2)) }) - + setMethod("/", signature(e1 = "Rcpp_MXSymbol", e2 = "numeric"), function(e1, e2) { + mx.varg.symbol.internal.DivScalar(list(e1, scalar = e2)) + }) } diff --git a/R-package/demo/00Index b/R-package/demo/00Index index 059f16492fc7..1629d05f86f4 100644 --- a/R-package/demo/00Index +++ b/R-package/demo/00Index @@ -1 +1,7 @@ +basic_bench Basic benchmark +basic_executor Basic executor operations +basic_kvstore Basic kvstore operations +basic_model Basic model operations basic_ndarray Basic ndarray operations +basic_random Basic random number generators +basic_symbol Basic symbol operations diff --git a/R-package/man/mx.io.CSVIter.Rd b/R-package/man/mx.io.CSVIter.Rd new file mode 100644 index 000000000000..31499ce5c2e5 --- /dev/null +++ b/R-package/man/mx.io.CSVIter.Rd @@ -0,0 +1,28 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/mxnet_generated.R +\name{mx.io.CSVIter} +\alias{mx.io.CSVIter} +\title{Create iterator for dataset in csv.} +\usage{ +mx.io.CSVIter(...) +} +\arguments{ +\item{data.csv}{string, required +Dataset Param: Data csv path.} + +\item{data.shape}{Shape(tuple), required +Dataset Param: Shape of the data.} + +\item{label.csv}{string, optional, default='NULL' +Dataset Param: Label csv path. If it is NULL, all labels will be returned as 0} + +\item{label.shape}{Shape(tuple), optional, default=(1,) +Dataset Param: Shape of the label.} +} +\value{ +iter The result mx.dataiter +} +\description{ +Create iterator for dataset in csv.
+} + diff --git a/R-package/man/mx.io.ImageRecordIter.Rd b/R-package/man/mx.io.ImageRecordIter.Rd index 4e13ca79790a..be652bac5ca9 100644 --- a/R-package/man/mx.io.ImageRecordIter.Rd +++ b/R-package/man/mx.io.ImageRecordIter.Rd @@ -40,16 +40,10 @@ Augmentation Param: Random Seed.} \item{batch.size}{int (non-negative), required Batch Param: Batch size.} -\item{data.shape}{Shape(tuple), required -Dataset Param: Shape of each instance generated by the DataIter.} - -\item{label.width}{int (non-negative), optional, default=1 -Dataset Param: Label width.} - \item{round.batch}{boolean, optional, default=True Batch Param: Use round robin to handle overflow batch.} -\item{prefetch.buffer}{long (non-negative), optional, default=4 +\item{prefetch.buffer}{, optional, default=4 Backend Param: Number of prefetched parameters} \item{rand.crop}{boolean, optional, default=False diff --git a/R-package/man/mx.io.MNISTIter.Rd b/R-package/man/mx.io.MNISTIter.Rd index 2e239022319e..afe17095a551 100644 --- a/R-package/man/mx.io.MNISTIter.Rd +++ b/R-package/man/mx.io.MNISTIter.Rd @@ -34,7 +34,7 @@ partition the data into multiple parts} \item{part.index}{int, optional, default='0' the index of the part will read} -\item{prefetch.buffer}{long (non-negative), optional, default=4 +\item{prefetch.buffer}{, optional, default=4 Backend Param: Number of prefetched parameters} } \value{ diff --git a/R-package/man/mx.nd.abs.Rd b/R-package/man/mx.nd.abs.Rd new file mode 100644 index 000000000000..ff65bced8c3c --- /dev/null +++ b/R-package/man/mx.nd.abs.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/mxnet_generated.R +\name{mx.nd.abs} +\alias{mx.nd.abs} +\title{Take absolute value of the src} +\arguments{ +\item{src}{NDArray +Source input to the function} +} +\value{ +out The result mx.ndarray +} +\description{ +Take absolute value of the src +} + diff --git a/R-package/man/mx.nd.ceil.Rd b/R-package/man/mx.nd.ceil.Rd new file mode 100644 index 000000000000..5e18ff72f9a2 --- /dev/null +++ b/R-package/man/mx.nd.ceil.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/mxnet_generated.R +\name{mx.nd.ceil} +\alias{mx.nd.ceil} +\title{Take ceil value of the src} +\arguments{ +\item{src}{NDArray +Source input to the function} +} +\value{ +out The result mx.ndarray +} +\description{ +Take ceil value of the src +} + diff --git a/R-package/man/mx.nd.cos.Rd b/R-package/man/mx.nd.cos.Rd new file mode 100644 index 000000000000..3ab50dee28db --- /dev/null +++ b/R-package/man/mx.nd.cos.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/mxnet_generated.R +\name{mx.nd.cos} +\alias{mx.nd.cos} +\title{Take cos of the src} +\arguments{ +\item{src}{NDArray +Source input to the function} +} +\value{ +out The result mx.ndarray +} +\description{ +Take cos of the src +} + diff --git a/R-package/man/mx.nd.floor.Rd b/R-package/man/mx.nd.floor.Rd new file mode 100644 index 000000000000..cc63c26cecd3 --- /dev/null +++ b/R-package/man/mx.nd.floor.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/mxnet_generated.R +\name{mx.nd.floor} +\alias{mx.nd.floor} +\title{Take floor value of the src} +\arguments{ +\item{src}{NDArray +Source input to the function} +} +\value{ +out The result mx.ndarray +} +\description{ +Take floor value of the src +} + diff --git a/R-package/man/mx.nd.max.Rd b/R-package/man/mx.nd.max.Rd new file mode 100644 index 
000000000000..842b971c9700 --- /dev/null +++ b/R-package/man/mx.nd.max.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/mxnet_generated.R +\name{mx.nd.max} +\alias{mx.nd.max} +\title{Take max of the src. The result will be ndarray of shape (1,) on the same device.} +\arguments{ +\item{src}{NDArray +Source input to the function} +} +\value{ +out The result mx.ndarray +} +\description{ +Take max of the src. The result will be ndarray of shape (1,) on the same device. +} + diff --git a/R-package/man/mx.nd.min.Rd b/R-package/man/mx.nd.min.Rd new file mode 100644 index 000000000000..88998dab5fb6 --- /dev/null +++ b/R-package/man/mx.nd.min.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/mxnet_generated.R +\name{mx.nd.min} +\alias{mx.nd.min} +\title{Take min of the src. The result will be ndarray of shape (1,) on the same device.} +\arguments{ +\item{src}{NDArray +Source input to the function} +} +\value{ +out The result mx.ndarray +} +\description{ +Take min of the src. The result will be ndarray of shape (1,) on the same device. +} + diff --git a/R-package/man/mx.nd.round.Rd b/R-package/man/mx.nd.round.Rd new file mode 100644 index 000000000000..18cd5a310ceb --- /dev/null +++ b/R-package/man/mx.nd.round.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/mxnet_generated.R +\name{mx.nd.round} +\alias{mx.nd.round} +\title{Take round value of the src} +\arguments{ +\item{src}{NDArray +Source input to the function} +} +\value{ +out The result mx.ndarray +} +\description{ +Take round value of the src +} + diff --git a/R-package/man/mx.nd.rsqrt.Rd b/R-package/man/mx.nd.rsqrt.Rd new file mode 100644 index 000000000000..a73a8e4820d6 --- /dev/null +++ b/R-package/man/mx.nd.rsqrt.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/mxnet_generated.R +\name{mx.nd.rsqrt} +\alias{mx.nd.rsqrt} +\title{Take rsqrt of the src} +\arguments{ +\item{src}{NDArray +Source input to the function} +} +\value{ +out The result mx.ndarray +} +\description{ +Take rsqrt of the src +} + diff --git a/R-package/man/mx.nd.sign.Rd b/R-package/man/mx.nd.sign.Rd new file mode 100644 index 000000000000..1d3333c094d7 --- /dev/null +++ b/R-package/man/mx.nd.sign.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/mxnet_generated.R +\name{mx.nd.sign} +\alias{mx.nd.sign} +\title{Take sign value of the src} +\arguments{ +\item{src}{NDArray +Source input to the function} +} +\value{ +out The result mx.ndarray +} +\description{ +Take sign value of the src +} + diff --git a/R-package/man/mx.nd.sin.Rd b/R-package/man/mx.nd.sin.Rd new file mode 100644 index 000000000000..61119d277d30 --- /dev/null +++ b/R-package/man/mx.nd.sin.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/mxnet_generated.R +\name{mx.nd.sin} +\alias{mx.nd.sin} +\title{Take sin of the src} +\arguments{ +\item{src}{NDArray +Source input to the function} +} +\value{ +out The result mx.ndarray +} +\description{ +Take sin of the src +} + diff --git a/R-package/man/mx.nd.sum.Rd b/R-package/man/mx.nd.sum.Rd new file mode 100644 index 000000000000..a9046d0ab2f7 --- /dev/null +++ b/R-package/man/mx.nd.sum.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/mxnet_generated.R +\name{mx.nd.sum} +\alias{mx.nd.sum} +\title{Take sum of the src. The
result will be ndarray of shape (1,) on the same device.} +\arguments{ +\item{src}{NDArray +Source input to the function} +} +\value{ +out The result mx.ndarray +} +\description{ +Take sum of the src. The result will be ndarray of shape (1,) on the same device. +} + diff --git a/R-package/man/mx.symbol.Activation.Rd b/R-package/man/mx.symbol.Activation.Rd index 3fd9892faedc..1e8cc8388998 100644 --- a/R-package/man/mx.symbol.Activation.Rd +++ b/R-package/man/mx.symbol.Activation.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/mxnet_generated.R \name{mx.symbol.Activation} \alias{mx.symbol.Activation} -\title{Apply activation function to input.} +\title{Apply activation function to input. Softmax Activation is only available with CUDNN on GPU and will be computed at each location across channel if input is 4D.} \usage{ mx.symbol.Activation(...) } @@ -10,7 +10,7 @@ mx.symbol.Activation(...) \item{data}{Symbol Input data to activation function.} -\item{act.type}{{'relu', 'sigmoid', 'tanh'}, required +\item{act.type}{{'relu', 'sigmoid', 'softrelu', 'tanh'}, required Activation function to be applied.} \item{name}{string, optional Name of the resulting symbol.} @@ -20,6 +20,6 @@ out The result mx.symbol } \description{ -Apply activation function to input. +Apply activation function to input. Softmax Activation is only available with CUDNN on GPU and will be computed at each location across channel if input is 4D. } diff --git a/R-package/man/mx.symbol.BatchNorm.Rd b/R-package/man/mx.symbol.BatchNorm.Rd index 2f7a984d5d97..d8fc0235a73d 100644 --- a/R-package/man/mx.symbol.BatchNorm.Rd +++ b/R-package/man/mx.symbol.BatchNorm.Rd @@ -10,12 +10,15 @@ mx.symbol.BatchNorm(...) \item{data}{Symbol Input data to batch normalization} -\item{eps}{float, optional, default=1e-10 +\item{eps}{float, optional, default=0.001 Epsilon to prevent div 0} -\item{momentum}{float, optional, default=0.1 +\item{momentum}{float, optional, default=0.9 Momentum for moving average} +\item{fix.gamma}{boolean, optional, default=True +Fix gamma while training} + \item{name}{string, optional Name of the resulting symbol.} } diff --git a/R-package/man/mx.symbol.Concat.Rd b/R-package/man/mx.symbol.Concat.Rd index e290ede87c9a..d30a1fccc18e 100644 --- a/R-package/man/mx.symbol.Concat.Rd +++ b/R-package/man/mx.symbol.Concat.Rd @@ -10,6 +10,9 @@ mx.symbol.Concat(...) \item{num.args}{int, required Number of inputs to be concated.} +\item{dim}{int, optional, default='1' +the dimension to be concated.} + \item{name}{string, optional Name of the resulting symbol.} } diff --git a/R-package/man/mx.symbol.Convolution.Rd b/R-package/man/mx.symbol.Convolution.Rd index 8914c6cbec78..140be1f8ff41 100644 --- a/R-package/man/mx.symbol.Convolution.Rd +++ b/R-package/man/mx.symbol.Convolution.Rd @@ -32,7 +32,7 @@ convolution filter(channel) number} Number of groups partition.
This option is not supported by CuDNN, you can use SliceChannel to num_group,apply convolution and concat instead to achieve the same need.} \item{workspace}{long (non-negative), optional, default=512 -Tmp workspace for convolution (MB)} +Tmp workspace for convolution (MB).} \item{no.bias}{boolean, optional, default=False Whether to disable bias parameter.} diff --git a/R-package/man/mx.symbol.Embedding.Rd b/R-package/man/mx.symbol.Embedding.Rd new file mode 100644 index 000000000000..0e9029ea3a8f --- /dev/null +++ b/R-package/man/mx.symbol.Embedding.Rd @@ -0,0 +1,31 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/mxnet_generated.R +\name{mx.symbol.Embedding} +\alias{mx.symbol.Embedding} +\title{Get embedding for one-hot input} +\usage{ +mx.symbol.Embedding(...) +} +\arguments{ +\item{data}{Symbol +Input data to the EmbeddingOp.} + +\item{weight}{Symbol +Embedding weight matrix.} + +\item{input.dim}{int, required +input dim of one-hot encoding} + +\item{output.dim}{int, required +output dim of embedding} + +\item{name}{string, optional +Name of the resulting symbol.} +} +\value{ +out The result mx.symbol +} +\description{ +Get embedding for one-hot input +} + diff --git a/R-package/man/mx.symbol.IdentityAttachKLSparseReg.Rd b/R-package/man/mx.symbol.IdentityAttachKLSparseReg.Rd new file mode 100644 index 000000000000..c280235d6052 --- /dev/null +++ b/R-package/man/mx.symbol.IdentityAttachKLSparseReg.Rd @@ -0,0 +1,31 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/mxnet_generated.R +\name{mx.symbol.IdentityAttachKLSparseReg} +\alias{mx.symbol.IdentityAttachKLSparseReg} +\title{Apply a sparse regularization to the output of a sigmoid activation function.} +\usage{ +mx.symbol.IdentityAttachKLSparseReg(...) +} +\arguments{ +\item{data}{Symbol +Input data.} + +\item{sparseness.target}{float, optional, default=0.1 +The sparseness target} + +\item{penalty}{float, optional, default=0.001 +The tradeoff parameter for the sparseness penalty} + +\item{momentum}{float, optional, default=0.9 +The momentum for running average} + +\item{name}{string, optional +Name of the resulting symbol.} +} +\value{ +out The result mx.symbol +} +\description{ +Apply a sparse regularization to the output of a sigmoid activation function. +} + diff --git a/R-package/man/mx.symbol.LeakyReLU.Rd b/R-package/man/mx.symbol.LeakyReLU.Rd index 3a91c82e2df7..429643cdc73a 100644 --- a/R-package/man/mx.symbol.LeakyReLU.Rd +++ b/R-package/man/mx.symbol.LeakyReLU.Rd @@ -10,11 +10,11 @@ mx.symbol.LeakyReLU(...) \item{data}{Symbol Input data to activation function.} -\item{act.type}{{'leaky', 'prelu', 'rrelu'},optional, default='leaky' +\item{act.type}{{'elu', 'leaky', 'prelu', 'rrelu'},optional, default='leaky' Activation function to be applied.} \item{slope}{float, optional, default=0.25 -Init slope for the activation. (For leaky only)} +Init slope for the activation. (For leaky and elu only)} \item{lower.bound}{float, optional, default=0.125 Lower bound of random slope.
(For rrelu only)} diff --git a/R-package/man/mx.symbol.LinearRegressionOutput.Rd b/R-package/man/mx.symbol.LinearRegressionOutput.Rd index 4dd9faef6082..5ee18cff7a26 100644 --- a/R-package/man/mx.symbol.LinearRegressionOutput.Rd +++ b/R-package/man/mx.symbol.LinearRegressionOutput.Rd @@ -13,6 +13,9 @@ Input data to function.} \item{label}{Symbol Input label to function.} +\item{grad.scale}{float, optional, default=1 +Scale the gradient by a float factor} + \item{name}{string, optional Name of the resulting symbol.} } diff --git a/R-package/man/mx.symbol.LogisticRegressionOutput.Rd b/R-package/man/mx.symbol.LogisticRegressionOutput.Rd index f6825519961e..bcd90a898101 100644 --- a/R-package/man/mx.symbol.LogisticRegressionOutput.Rd +++ b/R-package/man/mx.symbol.LogisticRegressionOutput.Rd @@ -14,6 +14,9 @@ Input data to function.} \item{label}{Symbol Input label to function.} +\item{grad.scale}{float, optional, default=1 +Scale the gradient by a float factor} + \item{name}{string, optional Name of the resulting symbol.} } diff --git a/R-package/man/mx.symbol.MAERegressionOutput.Rd b/R-package/man/mx.symbol.MAERegressionOutput.Rd new file mode 100644 index 000000000000..03eacdbaea33 --- /dev/null +++ b/R-package/man/mx.symbol.MAERegressionOutput.Rd @@ -0,0 +1,28 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/mxnet_generated.R +\name{mx.symbol.MAERegressionOutput} +\alias{mx.symbol.MAERegressionOutput} +\title{Use mean absolute error regression for final output; this is used on the final output of a net.} +\usage{ +mx.symbol.MAERegressionOutput(...) +} +\arguments{ +\item{data}{Symbol +Input data to function.} + +\item{label}{Symbol +Input label to function.} + +\item{grad.scale}{float, optional, default=1 +Scale the gradient by a float factor} + +\item{name}{string, optional +Name of the resulting symbol.} +} +\value{ +out The result mx.symbol +} +\description{ +Use mean absolute error regression for final output; this is used on the final output of a net. +} + diff --git a/R-package/man/mx.symbol.Reshape.Rd b/R-package/man/mx.symbol.Reshape.Rd index 803e5d1d4335..0f31dbfe8e1e 100644 --- a/R-package/man/mx.symbol.Reshape.Rd +++ b/R-package/man/mx.symbol.Reshape.Rd @@ -11,7 +11,7 @@ mx.symbol.Reshape(...) Input data to reshape.} \item{target.shape}{Shape(tuple), required -Target new shape} +Target new shape. One and only one dim can be 0, in which case it will be inferred from the rest of dims} \item{name}{string, optional Name of the resulting symbol.} diff --git a/R-package/man/mx.symbol.SoftmaxActivation.Rd b/R-package/man/mx.symbol.SoftmaxActivation.Rd new file mode 100644 index 000000000000..b0cb6771f453 --- /dev/null +++ b/R-package/man/mx.symbol.SoftmaxActivation.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/mxnet_generated.R +\name{mx.symbol.SoftmaxActivation} +\alias{mx.symbol.SoftmaxActivation} +\title{Apply softmax activation to input. This is intended for internal layers. For output (loss layer) please use SoftmaxOutput. If type=instance, this operator will compute a softmax for each instance in the batch; this is the default mode. If type=channel, this operator will compute a num_channel-class softmax at each position of each instance; this can be used for fully convolutional network, image segmentation, etc.} +\usage{ +mx.symbol.SoftmaxActivation(...)
+} +\arguments{ +\item{data}{Symbol +Input data to activation function.} + +\item{type}{{'channel', 'instance'},optional, default='instance' +Softmax Mode. If set to instance, this operator will compute a softmax for each instance in the batch; this is the default mode. If set to channel, this operator will compute a num_channel-class softmax at each position of each instance; this can be used for fully convolutional network, image segmentation, etc.} + +\item{name}{string, optional +Name of the resulting symbol.} +} +\value{ +out The result mx.symbol +} +\description{ +Apply softmax activation to input. This is intended for internal layers. For output (loss layer) please use SoftmaxOutput. If type=instance, this operator will compute a softmax for each instance in the batch; this is the default mode. If type=channel, this operator will compute a num_channel-class softmax at each position of each instance; this can be used for fully convolutional network, image segmentation, etc. +} + diff --git a/R-package/man/mx.symbol.SwapAxis.Rd b/R-package/man/mx.symbol.SwapAxis.Rd new file mode 100644 index 000000000000..b74831cd3fb0 --- /dev/null +++ b/R-package/man/mx.symbol.SwapAxis.Rd @@ -0,0 +1,28 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/mxnet_generated.R +\name{mx.symbol.SwapAxis} +\alias{mx.symbol.SwapAxis} +\title{Apply swapaxis to input.} +\usage{ +mx.symbol.SwapAxis(...) +} +\arguments{ +\item{data}{Symbol +Input data to the SwapAxisOp.} + +\item{dim1}{int (non-negative), optional, default=0 +the first axis to be swapped.} + +\item{dim2}{int (non-negative), optional, default=0 +the second axis to be swapped.} + +\item{name}{string, optional +Name of the resulting symbol.} +} +\value{ +out The result mx.symbol +} +\description{ +Apply swapaxis to input. +} + diff --git a/R-package/man/mx.symbol.UpSampling.Rd b/R-package/man/mx.symbol.UpSampling.Rd new file mode 100644 index 000000000000..83ef96e93ac4 --- /dev/null +++ b/R-package/man/mx.symbol.UpSampling.Rd @@ -0,0 +1,31 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/mxnet_generated.R +\name{mx.symbol.UpSampling} +\alias{mx.symbol.UpSampling} +\title{Perform nearest neighbor/bilinear up sampling to inputs} +\usage{ +mx.symbol.UpSampling(...) +} +\arguments{ +\item{scale}{int (non-negative), required +Up sampling scale} + +\item{num.filter}{int (non-negative), optional, default=0 +Input filter. Only used by nearest sample_type.} + +\item{sample.type}{{'bilinear', 'nearest'}, required +upsampling method} + +\item{num.args}{int, required +Number of inputs to be upsampled. For nearest neighbor upsampling, this can be 1-N; the size of output will be (scale*h_0,scale*w_0) and all other inputs will be upsampled to the same size. For bilinear upsampling this must be 2; 1 input and 1 weight.} + +\item{name}{string, optional +Name of the resulting symbol.} +} +\value{ +out The result mx.symbol +} +\description{ +Perform nearest neighbor/bilinear up sampling to inputs +} + diff --git a/R-package/man/mx.symbol.abs.Rd b/R-package/man/mx.symbol.abs.Rd new file mode 100644 index 000000000000..70f81f175bda --- /dev/null +++ b/R-package/man/mx.symbol.abs.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/mxnet_generated.R +\name{mx.symbol.abs} +\alias{mx.symbol.abs} +\title{Take absolute value of the src} +\usage{ +mx.symbol.abs(...)
+} +\arguments{ +\item{src}{Symbol +Source symbolic input to the function} + +\item{name}{string, optional +Name of the resulting symbol.} +} +\value{ +out The result mx.symbol +} +\description{ +Take absolute value of the src +} + diff --git a/R-package/man/mx.symbol.ceil.Rd b/R-package/man/mx.symbol.ceil.Rd new file mode 100644 index 000000000000..3d504adc9ac1 --- /dev/null +++ b/R-package/man/mx.symbol.ceil.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/mxnet_generated.R +\name{mx.symbol.ceil} +\alias{mx.symbol.ceil} +\title{Take ceil value of the src} +\usage{ +mx.symbol.ceil(...) +} +\arguments{ +\item{src}{Symbol +Source symbolic input to the function} + +\item{name}{string, optional +Name of the resulting symbol.} +} +\value{ +out The result mx.symbol +} +\description{ +Take ceil value of the src +} + diff --git a/R-package/man/mx.symbol.cos.Rd b/R-package/man/mx.symbol.cos.Rd new file mode 100644 index 000000000000..39bacf0a7f53 --- /dev/null +++ b/R-package/man/mx.symbol.cos.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/mxnet_generated.R +\name{mx.symbol.cos} +\alias{mx.symbol.cos} +\title{Take cos of the src} +\usage{ +mx.symbol.cos(...) +} +\arguments{ +\item{src}{Symbol +Source symbolic input to the function} + +\item{name}{string, optional +Name of the resulting symbol.} +} +\value{ +out The result mx.symbol +} +\description{ +Take cos of the src +} + diff --git a/R-package/man/mx.symbol.floor.Rd b/R-package/man/mx.symbol.floor.Rd new file mode 100644 index 000000000000..643c765fdc30 --- /dev/null +++ b/R-package/man/mx.symbol.floor.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/mxnet_generated.R +\name{mx.symbol.floor} +\alias{mx.symbol.floor} +\title{Take floor value of the src} +\usage{ +mx.symbol.floor(...) +} +\arguments{ +\item{src}{Symbol +Source symbolic input to the function} + +\item{name}{string, optional +Name of the resulting symbol.} +} +\value{ +out The result mx.symbol +} +\description{ +Take floor value of the src +} + diff --git a/R-package/man/mx.symbol.round.Rd b/R-package/man/mx.symbol.round.Rd new file mode 100644 index 000000000000..00ecf48f309f --- /dev/null +++ b/R-package/man/mx.symbol.round.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/mxnet_generated.R +\name{mx.symbol.round} +\alias{mx.symbol.round} +\title{Take round value of the src} +\usage{ +mx.symbol.round(...) +} +\arguments{ +\item{src}{Symbol +Source symbolic input to the function} + +\item{name}{string, optional +Name of the resulting symbol.} +} +\value{ +out The result mx.symbol +} +\description{ +Take round value of the src +} + diff --git a/R-package/man/mx.symbol.rsqrt.Rd b/R-package/man/mx.symbol.rsqrt.Rd new file mode 100644 index 000000000000..a190be29b6a8 --- /dev/null +++ b/R-package/man/mx.symbol.rsqrt.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/mxnet_generated.R +\name{mx.symbol.rsqrt} +\alias{mx.symbol.rsqrt} +\title{Take rsqrt of the src} +\usage{ +mx.symbol.rsqrt(...) 
+} +\arguments{ +\item{src}{Symbol +Source symbolic input to the function} + +\item{name}{string, optional +Name of the resulting symbol.} +} +\value{ +out The result mx.symbol +} +\description{ +Take rsqrt of the src +} + diff --git a/R-package/man/mx.symbol.sign.Rd b/R-package/man/mx.symbol.sign.Rd new file mode 100644 index 000000000000..b628380bd961 --- /dev/null +++ b/R-package/man/mx.symbol.sign.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/mxnet_generated.R +\name{mx.symbol.sign} +\alias{mx.symbol.sign} +\title{Take sign value of the src} +\usage{ +mx.symbol.sign(...) +} +\arguments{ +\item{src}{Symbol +Source symbolic input to the function} + +\item{name}{string, optional +Name of the resulting symbol.} +} +\value{ +out The result mx.symbol +} +\description{ +Take sign value of the src +} + diff --git a/R-package/man/mx.symbol.sin.Rd b/R-package/man/mx.symbol.sin.Rd new file mode 100644 index 000000000000..7b77a8d18ff6 --- /dev/null +++ b/R-package/man/mx.symbol.sin.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/mxnet_generated.R +\name{mx.symbol.sin} +\alias{mx.symbol.sin} +\title{Take sin of the src} +\usage{ +mx.symbol.sin(...) +} +\arguments{ +\item{src}{Symbol +Source symbolic input to the function} + +\item{name}{string, optional +Name of the resulting symbol.} +} +\value{ +out The result mx.symbol +} +\description{ +Take sin of the src +} + diff --git a/R-package/src/base.h b/R-package/src/base.h index a9763cc022be..ff618f0dfafb 100644 --- a/R-package/src/base.h +++ b/R-package/src/base.h @@ -15,6 +15,7 @@ #include #include #include +#include <algorithm> /*! \brief namespace of mxnet */ namespace mxnet { @@ -265,7 +266,8 @@ inline bool isSimple(const Rcpp::RObject& val) { inline std::string toPyString(const std::string &key, const Rcpp::RObject& val) { std::ostringstream os; int len = Rf_length(val); - if (len != 1) { + if (len != 1 || + key.substr(std::max(5, static_cast<int>(key.size())) - 5) == std::string("shape")) { RCHECK(TYPEOF(val) == INTSXP || TYPEOF(val) == REALSXP) << "Only accept integer vectors or simple types"; // Do shape conversion back to reversed shape.
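As a side note on the `toPyString` change above: with the length check relaxed for keys ending in `shape`, a length-1 R vector supplied for a shape parameter is serialized as a tuple string rather than a bare scalar. A minimal R sketch of the behavior this enables; the file name and shape are hypothetical, and `mx.io.CSVIter` is the iterator added earlier in this diff:

```r
library(mxnet)
# Hypothetical CSV with 784 values per row. With the fix above,
# data.shape = c(784) is serialized for the backend as the tuple string
# "(784,)" instead of the bare scalar "784".
iter <- mx.io.CSVIter(data.csv = "records.csv", data.shape = c(784))
```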
diff --git a/R-package/src/io.cc b/R-package/src/io.cc index f84fd2159aea..8da9fbba839c 100644 --- a/R-package/src/io.cc +++ b/R-package/src/io.cc @@ -151,7 +151,6 @@ DataIterCreateFunction::DataIterCreateFunction const char **arg_names; const char **arg_type_infos; const char **arg_descriptions; - const char *key_var_num_args; MX_CALL(MXDataIterGetIterInfo( handle_, &name, &description, &num_args, diff --git a/R-package/src/symbol.cc b/R-package/src/symbol.cc index 82cd2cb86696..62aab69b1823 100644 --- a/R-package/src/symbol.cc +++ b/R-package/src/symbol.cc @@ -109,7 +109,7 @@ Symbol::RObjectType Symbol::GetInternals() const { Symbol::RObjectType Symbol::GetOutput(mx_uint index) const { SymbolHandle out; - MX_CALL(MXSymbolGetOutput(handle_, index, &out)); + MX_CALL(MXSymbolGetOutput(handle_, index - 1, &out)); return Symbol::RObject(out); } @@ -316,6 +316,8 @@ void Symbol::InitRcppModule() { "Get a symbol that contains all the internals") .method("get.output", &Symbol::GetOutput, "Get index-th output symbol of current one") + .method("[[", &Symbol::GetOutput, + "Get index-th output symbol of current one") .method("infer.shape", &Symbol::InferShape, "Inference the shape information given unknown ones"); diff --git a/dmlc-core b/dmlc-core index c109d083b64a..27013a86f8b8 160000 --- a/dmlc-core +++ b/dmlc-core @@ -1 +1 @@ -Subproject commit c109d083b64abb1f6d8fff07e51086815818264b +Subproject commit 27013a86f8b8fd8bb9ebf2253928436e0eb38e13 diff --git a/doc/aws.md b/doc/aws.md index 09b9bc6b477e..67edf9e2667b 100644 --- a/doc/aws.md +++ b/doc/aws.md @@ -10,14 +10,14 @@ MXNet. In particular, we will address: ## Use Amazon S3 to host data -Amazon S3 is distributed data storage, which is quite convenient for host large -scale datasets. In order to S3, we need first to get the +Amazon S3 is distributed data storage, which is quite convenient for hosting large +scale datasets. In order to use S3, we first need to get the [AWS credentials](http://docs.aws.amazon.com/AWSSimpleQueueService/latest/SQSGettingStartedGuide/AWSCredentials.html)), -which includes a `ACCESS_KEY_ID` and a `SECRET_ACCESS_KEY`. +which includes an `ACCESS_KEY_ID` and a `SECRET_ACCESS_KEY`. In order for MXNet to use S3, we only need to set the environment variables `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` properly. For example, we can add the following two lines in -`~/.bashrc` (replace the strings with the correct ones) +`~/.bashrc` (replacing the strings with the correct ones) ```bash export AWS_ACCESS_KEY_ID=AKIAIOSFODNN7EXAMPLE @@ -42,7 +42,7 @@ MXNet requires the following libraries - `opencv` for image augmentations - `curl` and `openssl` for read/write Amazon S3 -Installing `CUDA` on EC2 instances needs a little bit efforts. Caffe has a nice +Installing `CUDA` on EC2 instances needs a little bit of effort. Caffe has a nice [tutorial](https://github.com/BVLC/caffe/wiki/Install-Caffe-on-EC2-from-scratch-(Ubuntu,-CUDA-7,-cuDNN)) on how to install CUDA 7.0 on Ubuntu 14.04 (Note: we tried CUDA 7.5 on Nov 7 2015, but it is problematic.)
@@ -74,7 +74,7 @@ echo "USE_S3=1" >>config.mk make -j8 ``` -Test if every goes well, we train a convolution neural network on MNIST using GPU: +In order to test whether everything has installed properly, we train a convolution neural network on MNIST using GPU: ```bash python tests/python/gpu/test_conv.py diff --git a/doc/build.md b/doc/build.md index 0c5f241c0cec..b3d8ff2559dd 100644 --- a/doc/build.md +++ b/doc/build.md @@ -2,12 +2,16 @@ Installation Guide ================== This page gives instructions of how to build and install the mxnet package from -scratch on various systems. It consists of two steps, first we build the shared -library from the C++ codes (`libmxnet.so` for linux/osx and `libmxnet.dll` for -windows). Then we install the language, e.g. Python, packages. If the -instructions on this page do not work for you, please feel free to ask questions -at [mxnet/issues](https://github.com/dmlc/mxnet/issues), or even better to send -pull request if you can fix the problem. +scratch on various systems. It consists of two steps: + +1. Fist build the shared library from the C++ codes (`libmxnet.so` for linux, + `libmxnet.dylib` for osx and `libmxnet.dll` for windows). +2. Then install the language packages (e.g. Python package). + +Please refer to [FAQ](#frequently-asked-questions) first if you had any problem +during installation. If the instructions do not work for you, please feel free +to ask questions at [mxnet/issues](https://github.com/dmlc/mxnet/issues), or +even better to send pull request if you can fix the problem. ## Contents - [Build the Shared Library](#build-mxnet-library) @@ -20,6 +24,7 @@ pull request if you can fix the problem. - [Python Package Installation](#python-package-installation) - [R Package Installation](#r-package-installation) - [Docker Images](#docker-images) +- [Frequently asked questions](#frequently-asked-questions) ## Build the Shared Library @@ -175,6 +180,10 @@ There are several ways to install the package: ```bash sudo apt-get install python-setuptools ``` + + *NOTE: If you recompiled mxnet, then you need to reinstall mxnet again to + make the new library take effect* + 2. Only set the environment variable `PYTHONPATH` to tell python where to find the library. For example, assume we cloned `mxnet` on the home directory `~`. then we can added the following line in `~/.bashrc` @@ -249,3 +258,29 @@ sudo docker run -it --device /dev/nvidiactl --device /dev/nvidia-uvm --device /d For a guide to Docker, see the [official docs](https://docs.docker.com/userguide/). For more details on how to use the MXNet Docker images, including requirements for CUDA support, consult the [source project](https://github.com/Kaixhin/dockerfiles). + +## Frequently Asked Questions + +1. **Compile failed after `git pull`** + + Please first update the submodules, clean all and recompile: + + ```bash + git submodule update && make clean_all && make -j4 + ``` + +2. **Compile failed after `config.mk` is modified** + + This often happens if `USE_CUDA` or `USE_DIST_KVSTORE` has been changed. You + need to clean all first: + + ```bash + make clean_all && make -j4 + ``` + +3. **Still get the error message e.g. `compile with USE_DIST_KVSTORE=1 to use + dist` after recomplied with `USE_DIST_KVSTORE=1`** + + It is often because mxnet is failed to load the new built library. If you + installed mxnet system-widely, e.g. `python setup.py install`, then you need + to reinstall the package again. 
diff --git a/example/image-classification/README.md b/example/image-classification/README.md index 468b04b6b8e2..811c639593d7 100644 --- a/example/image-classification/README.md +++ b/example/image-classification/README.md @@ -147,31 +147,31 @@ model.fit(X=train_data, y=train_label) The following factors may significantly affect the performance: -- Use a fast backend. A fast BLAS library, e.g. openblas, altas, +1. Use a fast backend. A fast BLAS library, e.g. openblas, atlas, and mkl, is necessary if only using CPU. While for Nvidia GPUs, we strongly recommend to use CUDNN. -- Three important things for the input data: - - data format. If you are using the `rec` format, then everything should be +2. Three important things for the input data: + 1. data format. If you are using the `rec` format, then everything should be fine. - - decoding. In default MXNet uses 4 CPU threads for decoding the images, which + 2. decoding. By default MXNet uses 4 CPU threads for decoding the images, which are often able to decode over 1k images per second. You may increase the number of threads if either you are using a low-end CPU or your GPUs are very powerful. - - place to store the data. Any local or distributed filesystem (HDFS, Amazon + 3. place to store the data. Any local or distributed filesystem (HDFS, Amazon S3) should be fine. There may be a problem if multiple machines read the data from the network shared filesystem (NFS) at the same time. -- Use a large batch size. We often choose the largest one which can fit into +3. Use a large batch size. We often choose the largest one which can fit into the GPU memory. But a too large value may slow down the convergence. For example, the safe batch size for CIFAR 10 is around 200, while for ImageNet 1K, the batch size can go beyond 1K. -- Choose the proper `kvstore` if using more than one GPU. (See +4. Choose the proper `kvstore` if using more than one GPU. (See [doc/developer-guide/multi_node.md](../../doc/developer-guide/multi_node.md) for more information) - - For a single machine, often the default `local` is good enough. But you may want + 1. For a single machine, often the default `local` is good enough. But you may want to use `local_allreduce_device` for models with size >> 100MB such as AlexNet and VGG. But also note that `local_allreduce_device` takes more GPU memory than others. - - For multiple machines, we recommend to try `dist_sync` first. But if the + 2. For multiple machines, we recommend to try `dist_sync` first. But if the model size is quite large or you use a large number of machines, you may want to use `dist_async`. ## Results @@ -180,8 +180,9 @@ recommend to use CUDNN.
| name | hardware | software | | --- | --- | --- | - | GTX980 | dual Xeon E5-2680 v2, dual GTX 980, 1G Ethernet | GCC 4.8, CUDA 7.5, CUDNN v3 | - | EC2-g2.8x | Xeon E5-2670, dual GRID K520, 10G Ethernet | GCC 4.8, CUDA 7.5, CUDNN v3 | + | GTX980 | Xeon E5-1650 v3, 4 x GTX 980 | GCC 4.8, CUDA 7.5, CUDNN 3 | + | TitanX | dual Xeon E5-2630 v3, 4 x GTX Titan X | GCC 4.8, CUDA 7.5, CUDNN 3 | + | EC2-g2.8x | Xeon E5-2670, 2 x GRID K520, 10G Ethernet | GCC 4.8, CUDA 7.5, CUDNN 3 | - Datasets @@ -210,24 +211,48 @@ python train_cifar10.py --batch-size 128 --lr 0.1 --lr-factor .94 --num-epoch 50 ### ILSVRC 12 -#### `train_imagenet.py` with `--network alexnet` + -- time for one epoch: + - | 1 x GTX 980 | 2 x GTX 980 | 4 x GTX 980 | - | ----------- | ------------ | ------------ | - | 2,413 sec | 1,244 sec | 906 sec | + -#### `train_imagenet.py` with `--network inception-bn` + + + +#### VGG + +`train_imagenet.py` with `--network vgg` + +- Performance + + | Cluster | # machines | # GPUs | batch size | kvstore | epoch time | + | --- | --- | --- | --- | --- | ---: | + | TitanX | 1 | 1 | 96 | `none` | 14,545 | + | - | - | 2 | - | `local` | 19,692 | + | - | - | 4 | - | - | 20,014 | + | - | - | 2 | - | `local_allreduce_device` | 9,142 | + | - | - | 4 | - | - | 8,533 | + | - | - | - | 384 | - | 5,161 | + +#### Inception with Batch Normalization + +`train_imagenet.py` with `--network inception-bn` - Performance | Cluster | # machines | # GPUs | batch size | kvstore | epoch time | | --- | --- | --- | --- | --- | ---: | | GTX980 | 1 | 1 | 32 | `local` | 13,210 | - | - | 1 | 2 | 64 | `local` | 7,198 | - | - | 1 | 3 | 128 | `local` | 4,952 | - | - | 1 | 4 | 128 | `local` | 3,589 | + | - | - | 2 | 64 | - | 7,198 | + | - | - | 3 | 128 | - | 4,952 | + | - | - | 4 | - | - | 3,589 | + | TitanX | 1 | 1 | 128 | `none` | 10,666 | + | - | - | 2 | - | `local` | 5,161 | + | - | - | 3 | - | - | 3,460 | + | - | - | 4 | - | - | 2,844 | + | - | - | - | 512 | - | 2,495 | | EC2-g2.8x | 1 | 4 | 144 | `local` | 14,203 | | - | 10 | 40 | 144 | `dist_sync` | 1,422 | @@ -236,8 +261,8 @@ python train_cifar10.py --batch-size 128 --lr 0.1 --lr-factor .94 --num-epoch 50 - `single machine` : ```bash - python train_imagenet.py --network inception-bn \ - --batch-size 128 --lr 0.05 --num-epoch 60 --lr-factor .94 \ + python train_imagenet.py --batch-size 144 --lr 0.05 --lr-factor .94 \ + --gpus 0,1,2,3 --num-epoch 60 --network inception-bn \ --data-dir ilsvrc12/ --model-prefix model/ilsvrc12 ``` @@ -251,7 +276,8 @@ python train_cifar10.py --batch-size 128 --lr 0.1 --lr-factor .94 --num-epoch 50 --data-dir s3://dmlc/ilsvrc12/ --model-prefix s3://dmlc/model/ilsvrc12 ``` - *Note: S3 is unstable sometimes, before fixing this problem, we recommend to download data to `/mnt` first* + *Note: S3 is sometimes unstable. If your training hangs or gets errors + frequently, you can download the data to `/mnt` first* Accuracy vs epoch ([the interactive figure](https://docs.google.com/spreadsheets/d/1AEesHjWUZOzCN0Gp_PYI1Cw4U1kZMKot360p9Fowmjw/pubchart?oid=1740787404&format=interactive)): diff --git a/example/image-classification/train_imagenet.py b/example/image-classification/train_imagenet.py index 726a2a77ee55..54bf605e5d28 100644 --- a/example/image-classification/train_imagenet.py +++ b/example/image-classification/train_imagenet.py @@ -35,6 +35,10 @@ help='the number of training examples') parser.add_argument('--num-classes', type=int, default=1000, help='the number of classes') +parser.add_argument('--log-file', type=str, help='the name of log file')
+parser.add_argument('--log-dir', type=str, default="/tmp/",
+                    help='directory of the log file')
 args = parser.parse_args()

 # network
diff --git a/example/image-classification/train_model.py b/example/image-classification/train_model.py
index be2e08db36fb..002e9818e724 100644
--- a/example/image-classification/train_model.py
+++ b/example/image-classification/train_model.py
@@ -1,6 +1,7 @@
 import find_mxnet
 import mxnet as mx
 import logging
+import os

 def fit(args, network, data_loader):
     # kvstore
@@ -8,8 +9,22 @@ def fit(args, network, data_loader):
     # logging
     head = '%(asctime)-15s Node[' + str(kv.rank) + '] %(message)s'
-    logging.basicConfig(level=logging.DEBUG, format=head)
-    logging.info('start with arguments %s', args)
+    if 'log_file' in args and args.log_file is not None:
+        log_file = args.log_file
+        log_dir = args.log_dir
+        log_file_full_name = os.path.join(log_dir, log_file)
+        if not os.path.exists(log_dir):
+            os.mkdir(log_dir)
+        logger = logging.getLogger()
+        handler = logging.FileHandler(log_file_full_name)
+        formatter = logging.Formatter(head)
+        handler.setFormatter(formatter)
+        logger.addHandler(handler)
+        logger.setLevel(logging.DEBUG)
+        logger.info('start with arguments %s', args)
+    else:
+        logging.basicConfig(level=logging.DEBUG, format=head)
+        logging.info('start with arguments %s', args)

     # load model?
     model_prefix = args.model_prefix
diff --git a/example/kaggle-ndsb2/Preprocessing.py b/example/kaggle-ndsb2/Preprocessing.py
new file mode 100644
index 000000000000..fb55b4634066
--- /dev/null
+++ b/example/kaggle-ndsb2/Preprocessing.py
@@ -0,0 +1,141 @@
+"""Preprocessing script.
+
+This script walks over the directories and dumps the frames into a csv file
+"""
+import os
+import csv
+import sys
+import random
+import scipy
+import numpy as np
+import dicom
+from skimage import io, transform
+
+def mkdir(fname):
+    try:
+        os.mkdir(fname)
+    except OSError:
+        pass
+
+def get_frames(root_path):
+    """Get paths to all frames in SAX views that contain a complete set of 30 frames"""
+    ret = []
+    for root, _, files in os.walk(root_path):
+        if len(files) == 0 or not files[0].endswith(".dcm") or root.find("sax") == -1:
+            continue
+        prefix = files[0].rsplit('-', 1)[0]
+        fileset = set(files)
+        expected = ["%s-%04d.dcm" % (prefix, i + 1) for i in range(30)]
+        if all(x in fileset for x in expected):
+            ret.append([root + "/" + x for x in expected])
+    # sort for reproducibility
+    return sorted(ret, key = lambda x: x[0])
+
+
+def get_label_map(fname):
+    labelmap = {}
+    fi = open(fname)
+    fi.readline()
+    for line in fi:
+        arr = line.split(',')
+        labelmap[int(arr[0])] = line
+    return labelmap
+
+
+def write_label_csv(fname, frames, label_map):
+    fo = open(fname, "w")
+    for lst in frames:
+        index = int(lst[0].split("/")[3])
+        if label_map is not None:
+            fo.write(label_map[index])
+        else:
+            fo.write("%d,0,0\n" % index)
+    fo.close()
+
+
+def write_data_csv(fname, frames, preproc):
+    """Write data to csv file"""
+    fdata = open(fname, "w")
+    dwriter = csv.writer(fdata)
+    counter = 0
+    result = []
+    for lst in frames:
+        data = []
+        for path in lst:
+            f = dicom.read_file(path)
+            img = preproc(f.pixel_array.astype(float) / np.max(f.pixel_array))
+            dst_path = path.rsplit(".", 1)[0] + ".64x64.jpg"
+            scipy.misc.imsave(dst_path, img)
+            result.append(dst_path)
+            data.append(img)
+        data = np.array(data, dtype=np.uint8)
+        data = data.reshape(data.size)
+        dwriter.writerow(data)
+        counter += 1
+        if counter % 100 == 0:
+            print("%d slices processed" % counter)
+    print("All finished, %d slices in total" % counter)
+    fdata.close()
+    return result
+
+
+def crop_resize(img, size):
+    """crop center and resize"""
+    if img.shape[0] < img.shape[1]:
+        img = img.T
+    # we crop image from center
+    short_edge = min(img.shape[:2])
+    yy = int((img.shape[0] - short_edge) / 2)
+    xx = int((img.shape[1] - short_edge) / 2)
+    crop_img = img[yy : yy + short_edge, xx : xx + short_edge]
+    # resize to size x size
+    resized_img = transform.resize(crop_img, (size, size))
+    resized_img *= 255
+    return resized_img.astype("uint8")
+
+
+def local_split(train_index):
+    random.seed(0)
+    train_index = set(train_index)
+    all_index = sorted(train_index)
+    num_test = int(len(all_index) / 3)
+    random.shuffle(all_index)
+    train_set = set(all_index[num_test:])
+    test_set = set(all_index[:num_test])
+    return train_set, test_set
+
+
+def split_csv(src_csv, split_to_train, train_csv, test_csv):
+    ftrain = open(train_csv, "w")
+    ftest = open(test_csv, "w")
+    cnt = 0
+    for l in open(src_csv):
+        if split_to_train[cnt]:
+            ftrain.write(l)
+        else:
+            ftest.write(l)
+        cnt = cnt + 1
+    ftrain.close()
+    ftest.close()
+
+# Load the list of all training frames and shuffle them
+random.seed(10)
+train_frames = get_frames("./data/train")
+random.shuffle(train_frames)
+validate_frames = get_frames("./data/validate")
+
+# Write the corresponding label information of each frame into file.
+write_label_csv("./train-label.csv", train_frames, get_label_map("./data/train.csv"))
+write_label_csv("./validate-label.csv", validate_frames, None)
+
+# Dump the data of each frame into a CSV file, applying the crop-to-64 preprocessor
+train_lst = write_data_csv("./train-64x64-data.csv", train_frames, lambda x: crop_resize(x, 64))
+valid_lst = write_data_csv("./validate-64x64-data.csv", validate_frames, lambda x: crop_resize(x, 64))
+
+# Generate a local train/test split, which you can use to tune your model locally.
+train_index = np.loadtxt("./train-label.csv", delimiter=",")[:,0].astype("int")
+train_set, test_set = local_split(train_index)
+split_to_train = [x in train_set for x in train_index]
+split_csv("./train-label.csv", split_to_train, "./local_train-label.csv", "./local_test-label.csv")
+split_csv("./train-64x64-data.csv", split_to_train, "./local_train-64x64-data.csv", "./local_test-64x64-data.csv")
diff --git a/example/kaggle-ndsb2/README.md b/example/kaggle-ndsb2/README.md
new file mode 100644
index 000000000000..5ee5ef66b255
--- /dev/null
+++ b/example/kaggle-ndsb2/README.md
@@ -0,0 +1,69 @@
+# End-to-End Deep Learning Tutorial for Kaggle NDSB-II
+
+In this example, we will demo how to use MXNet to build an end-to-end deep learning system to help diagnose heart disease. The demo network achieves 0.039222 CRPS on the validation set, which was good enough for a Top-10 placing (as of Dec 22nd, 2015).
+
+Note that this is a very simple model, with no attempt to optimize the structure or hyperparameters; you can build a fantastic network based on it. While this tutorial is written in python, mxnet comes with support for other popular languages such as R and Julia, which can also be used. You are more than welcome to try them and contribute back to this example.
+
+This example requires a GPU to train. If you are working with AWS,
+a simple guide to building MXNet on AWS, along with an existing AMI, can be found in [this document](https://mxnet.readthedocs.org/en/latest/aws.html).
+You can also choose to put your data on S3 and have every machine load data directly from S3, without having to copy data over when starting new instances.
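+
+As a sketch of that S3 workflow (the bucket name below is hypothetical, and this assumes an mxnet build with S3 support enabled in the build config), the record iterators used in these examples accept S3 URIs wherever a local path is expected:
+
+```python
+import mxnet as mx
+
+# hypothetical bucket; mirrors the s3://dmlc/ilsvrc12/ paths used in the
+# image-classification examples above
+train = mx.io.ImageRecordIter(
+    path_imgrec="s3://my-bucket/ilsvrc12/train.rec",  # read straight from S3
+    data_shape=(3, 224, 224),
+    batch_size=32)
+```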
+
+
+## General Overview of model
+### Input Data
+We notice that in most of the data there are 30 frames per sequence. A simple idea is to pack this sequence into a multi-channel input, and then let the neural network learn from it. This tutorial is based on that idea: we first accumulate all suitable data with 30 frames, then feed it to the neural network to learn the target directly.
+
+Another idea is to use differences to measure change in the time series. With the MXNet symbolic interface, we can build dynamic difference channels that transform the input inside the network. This helps a little in the final result.
+
+### Network Objective
+For the network, we use a 20-year-old LeNet-style convolution network with batch normalization and dropout. We did not fine-tune the configuration and hyperparameters, as this is mainly for demonstration purposes; we are sure better solutions can be found.
+
+One important idea of the model is to predict exactly what the problem asks for. In this problem, we are asked to predict the CDF value at 600 data points, so we formulate the problem as a regression problem: we ask the neural net to output 600 values, which correspond to the CDF values to be predicted. The label is transformed into the 0-1 step function used in the evaluation target (a minimal sketch of this encoding is given at the end of this README).
+
+
+## Preprocessing
+We first run a preprocessing step to pack the data into a csv file. Each line of the csv file corresponds to a 30 x 64 x 64 tensor, which gives 30 frames of resized images. We could use other inputs besides csv; we chose csv because the format is common to all languages and easy to manipulate.
+The input dataset is quite big. While it can fit into the memory of a big machine, we want to be safe for all desktop settings, so we use a CSVIter from mxnet to load data from disk on the fly during training, without loading all the data into memory. You are also more than welcome to try the in-memory setting.
+
+
+
+## Step by step
+
+1. Prepare the raw data in the ```data``` folder. The tree of the ```data``` folder looks like
+
+```
+-data
+ |
+ ---- sample_submission_validate.csv
+ |
+ ---- train.csv
+ |
+ ---- train
+ |    |
+ |    ---- 0
+ |    |
+ |    ---- …
+ |
+ ---- validate
+      |
+      ---- 501
+      |
+      ---- …
+```
+
+2. Run ```python3 Preprocessing.py``` to preprocess the data.
+3. After we have the processed data, run ```python3 Train.py``` to generate ```submission.csv```.
+
+
+Note:
+- To run with python2, you need to change ```Train.py, line #139``` to the python2 syntax.
+- To modify the network, change the ```get_lenet``` function in ```Train.py```.
+- We also provide ```local_train``` and ```local_test``` files for local parameter tuning.
+- To run on multiple GPUs with a huge network, or for questions about saving network parameters etc., please refer to the [MXNet docs](https://mxnet.readthedocs.org/en/latest/).
+
+
+## About MXNet
+MXNet is a deep learning framework designed by the DMLC group for both efficiency and flexibility. Like all other DMLC packages, it will fully utilize all available resources to solve the problem under a limited resource constraint, with a flexible programming interface. You can use it for all kinds of data science and deep learning tasks with R, Julia, python and more. See the [MXNet docs](https://mxnet.readthedocs.org/en/latest/) for details.
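+
+As promised in the Network Objective section, here is a minimal stand-alone sketch of the 0-1 label encoding (the vectorized version used for real lives in ```Train.py``` below; the helper name here is ours):
+
+```python
+import numpy as np
+
+def cdf_label(volume, num_bins=600):
+    """Encode a scalar volume as the 0-1 step function the CRPS target expects:
+    entry k of the result is 1 exactly when volume < k."""
+    return (volume < np.arange(num_bins)).astype(np.uint8)
+
+# a systolic volume of 83 mL gives zeros up to index 83 and ones afterwards
+print(cdf_label(83.0)[80:86])  # -> [0 0 0 0 1 1]
+```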
diff --git a/example/kaggle-ndsb2/Train.R b/example/kaggle-ndsb2/Train.R
new file mode 100644
index 000000000000..352e4a156393
--- /dev/null
+++ b/example/kaggle-ndsb2/Train.R
@@ -0,0 +1,154 @@
+require(mxnet)
+require(data.table)
+
+get.lenet <- function() {
+  source <- mx.symbol.Variable("data")
+  source <- (source-128) / 128
+  frames <- mx.symbol.SliceChannel(source, num.outputs = 30)
+  diffs <- list()
+  for (i in 1:29) {
+    diffs <- c(diffs, frames[[i + 1]] - frames[[i]])
+  }
+  diffs$num.args = 29
+  source <- mxnet:::mx.varg.symbol.Concat(diffs)
+  net <- mx.symbol.Convolution(source, kernel = c(5, 5), num.filter = 40)
+  net <- mx.symbol.BatchNorm(net, fix.gamma = TRUE)
+  net <- mx.symbol.Activation(net, act.type = "relu")
+  net <- mx.symbol.Pooling(net, pool.type = "max", kernel = c(2, 2), stride = c(2, 2))
+  net <- mx.symbol.Convolution(net, kernel = c(3, 3), num.filter = 40)
+  net <- mx.symbol.BatchNorm(net, fix.gamma = TRUE)
+  net <- mx.symbol.Activation(net, act.type = "relu")
+  net <- mx.symbol.Pooling(net, pool.type = "max", kernel = c(2, 2), stride = c(2, 2))
+  flatten <- mx.symbol.Flatten(net)
+  flatten <- mx.symbol.Dropout(flatten)
+  fc1 <- mx.symbol.FullyConnected(data = flatten, num.hidden = 600)
+  return(mx.symbol.LogisticRegressionOutput(data = fc1, name = 'softmax'))
+}
+
+network <- get.lenet()
+batch_size <- 32
+
+data_train <- mx.io.CSVIter(
+  data.csv = "./train-64x64-data.csv", data.shape = c(64, 64, 30),
+  label.csv = "./train-stytole.csv", label.shape = 600,
+  batch.size = batch_size
+)
+
+data_validate <- mx.io.CSVIter(
+  data.csv = "./validate-64x64-data.csv",
+  data.shape = c(64, 64, 30),
+  batch.size = 1
+)
+
+mx.metric.CRPS <- mx.metric.custom("CRPS", function(label, pred) {
+  pred <- as.array(pred)
+  label <- as.array(label)
+  for (i in 1:dim(pred)[2]) {
+    for (j in 1:(dim(pred)[1] - 1)) {
+      if (pred[j, i] > pred[j + 1, i]) {
+        pred[j + 1, i] = pred[j, i]
+      }
+    }
+  }
+  return(sum((label - pred) ^ 2) / length(label))
+})
+
+mx.set.seed(0)
+stytole_model <- mx.model.FeedForward.create(
+  X = data_train,
+  ctx = mx.gpu(0),
+  symbol = network,
+  num.round = 65,
+  learning.rate = 0.001,
+  wd = 0.00001,
+  momentum = 0.9,
+  eval.metric = mx.metric.CRPS
+)
+
+stytole_prob = predict(stytole_model, data_validate)
+
+network = get.lenet()
+batch_size = 32
+data_train <- mx.io.CSVIter(
+  data.csv = "./train-64x64-data.csv", data.shape = c(64, 64, 30),
+  label.csv = "./train-diastole.csv", label.shape = 600,
+  batch.size = batch_size
+)
+
+diastole_model = mx.model.FeedForward.create(
+  X = data_train,
+  ctx = mx.gpu(0),
+  symbol = network,
+  num.round = 65,
+  learning.rate = 0.001,
+  wd = 0.00001,
+  momentum = 0.9,
+  eval.metric = mx.metric.CRPS
+)
+
+diastole_prob = predict(diastole_model, data_validate)
+
+accumulate_result <- function(validate_lst, prob) {
+  t <- read.table(validate_lst, sep = ",")
+  p <- cbind(t[,1], t(prob))
+  dt <- as.data.table(p)
+  return(dt[, lapply(.SD, mean), by = V1])
+}
+
+stytole_result = as.data.frame(accumulate_result("./validate-label.csv", stytole_prob))
+diastole_result = as.data.frame(accumulate_result("./validate-label.csv", diastole_prob))
+
+train_csv <- read.table("./train-label.csv", sep = ',')
+
+doHist <- function(data) {
+  res <- rep(0, 600)
+  for (i in 1:length(data)) {
+    for (j in round(data[i]):600) {
+      res[j] = res[j] + 1
+    }
+  }
+  return(res / length(data))
+}
+
+hSystole = doHist(train_csv[, 2])
+hDiastole = doHist(train_csv[, 3])
+
+res <- read.table("data/sample_submission_validate.csv",
sep = ",", header = TRUE, stringsAsFactors = FALSE) + +submission_helper <- function(pred) { + for (i in 2:length(pred)) { + if (pred[i] < pred[i - 1]) { + pred[i] = pred[i - 1] + } + } + return(pred) +} + +for (i in 1:nrow(res)) { + key <- unlist(strsplit(res$Id[i], "_"))[1] + target <- unlist(strsplit(res$Id[i], "_"))[2] + if (key %in% stytole_result$V1) { + if (target == 'Diastole') { + res[i, 2:601] <- submission_helper(diastole_result[which(diastole_result$V1 == key), 2:601]) + } else { + res[i, 2:601] <- submission_helper(stytole_result[which(stytole_result$V1 == key), 2:601]) + } + } else { + if (target == 'Diastole') { + res[i, 2:601] <- hDiastole + } else { + res[i, 2:601] <- hSystole + } + } +} + +write.table(res, file = "submission.csv", sep = ",", quote = FALSE, row.names = FALSE) diff --git a/example/kaggle-ndsb2/Train.py b/example/kaggle-ndsb2/Train.py new file mode 100644 index 000000000000..a51442d373ac --- /dev/null +++ b/example/kaggle-ndsb2/Train.py @@ -0,0 +1,217 @@ +"""Training script, this is converted from a ipython notebook +""" + +import os +import csv +import sys +import numpy as np +import mxnet as mx +import logging + +logger = logging.getLogger() +logger.setLevel(logging.DEBUG) + +# In[2]: + +def get_lenet(): + """ A lenet style net, takes difference of each frame as input. + """ + source = mx.sym.Variable("data") + source = (source - 128) * (1.0/128) + frames = mx.sym.SliceChannel(source, num_outputs=30) + diffs = [frames[i+1] - frames[i] for i in range(29)] + source = mx.sym.Concat(*diffs) + net = mx.sym.Convolution(source, kernel=(5, 5), num_filter=40) + net = mx.sym.BatchNorm(net, fix_gamma=True) + net = mx.sym.Activation(net, act_type="relu") + net = mx.sym.Pooling(net, pool_type="max", kernel=(2,2), stride=(2,2)) + net = mx.sym.Convolution(net, kernel=(3, 3), num_filter=40) + net = mx.sym.BatchNorm(net, fix_gamma=True) + net = mx.sym.Activation(net, act_type="relu") + net = mx.sym.Pooling(net, pool_type="max", kernel=(2,2), stride=(2,2)) + # first fullc + flatten = mx.symbol.Flatten(net) + flatten = mx.symbol.Dropout(flatten) + fc1 = mx.symbol.FullyConnected(data=flatten, num_hidden=600) + # Name the final layer as softmax so it auto matches the naming of data iterator + # Otherwise we can also change the provide_data in the data iter + return mx.symbol.LogisticRegressionOutput(data=fc1, name='softmax') + +def CRPS(label, pred): + """ Custom evaluation metric on CRPS. + """ + for i in range(pred.shape[0]): + for j in range(pred.shape[1] - 1): + if pred[i, j] > pred[i, j + 1]: + pred[i, j + 1] = pred[i, j] + return np.sum(np.square(label - pred)) / label.size + + +# In[3]: + +def encode_label(label_data): + """Run encoding to encode the label into the CDF target. 
+ """ + stytole = label_data[:, 1] + diastole = label_data[:, 2] + stytole_encode = np.array([ + (x < np.arange(600)) for x in stytole + ], dtype=np.uint8) + diastole_encode = np.array([ + (x < np.arange(600)) for x in diastole + ], dtype=np.uint8) + return stytole_encode, diastole_encode + +def encode_csv(label_csv, stytole_csv, diastole_csv): + stytole_encode, diastole_encode = encode_label(np.loadtxt(label_csv, delimiter=",")) + np.savetxt(stytole_csv, stytole_encode, delimiter=",", fmt="%g") + np.savetxt(diastole_csv, diastole_encode, delimiter=",", fmt="%g") + +# Write encoded label into the target csv +# We use CSV so that not all data need to sit into memory +# You can also use inmemory numpy array if your machine is large enough +encode_csv("./train-label.csv", "./train-stytole.csv", "./train-diastole.csv") + + +# # Training the stytole net + +# In[4]: + +network = get_lenet() +batch_size = 32 +devs = [mx.gpu(0)] +data_train = mx.io.CSVIter(data_csv="./train-64x64-data.csv", data_shape=(30, 64, 64), + label_csv="./train-stytole.csv", label_shape=(600,), + batch_size=batch_size) + +data_validate = mx.io.CSVIter(data_csv="./validate-64x64-data.csv", data_shape=(30, 64, 64), + batch_size=1) + +stytole_model = mx.model.FeedForward(ctx=devs, + symbol = network, + num_epoch = 65, + learning_rate = 0.001, + wd = 0.00001, + momentum = 0.9) + +stytole_model.fit(X=data_train, eval_metric = mx.metric.np(CRPS)) + + +# # Predict stytole + +# In[5]: + +stytole_prob = stytole_model.predict(data_validate) + + +# # Training the diastole net + +# In[6]: + +network = get_lenet() +batch_size = 32 +devs = [mx.gpu(0)] +data_train = mx.io.CSVIter(data_csv="./train-64x64-data.csv", data_shape=(30, 64, 64), + label_csv="./train-diastole.csv", label_shape=(600,), + batch_size=batch_size) + +diastole_model = mx.model.FeedForward(ctx=devs, + symbol = network, + num_epoch = 65, + learning_rate = 0.001, + wd = 0.00001, + momentum = 0.9) + +diastole_model.fit(X=data_train, eval_metric = mx.metric.np(CRPS)) + + +# # Predict diastole + +# In[7]: + +diastole_prob = diastole_model.predict(data_validate) + + +# # Generate Submission + +# In[8]: + +def accumulate_result(validate_lst, prob): + sum_result = {} + cnt_result = {} + size = prob.shape[0] + fi = csv.reader(open(validate_lst)) + for i in range(size): + line = fi.__next__() # Python2: line = fi.next() + idx = int(line[0]) + if idx not in cnt_result: + cnt_result[idx] = 0. 
+            sum_result[idx] = np.zeros((1, prob.shape[1]))
+        cnt_result[idx] += 1
+        sum_result[idx] += prob[i, :]
+    for i in cnt_result.keys():
+        sum_result[i][:] /= cnt_result[i]
+    return sum_result
+
+
+# In[9]:
+
+stytole_result = accumulate_result("./validate-label.csv", stytole_prob)
+diastole_result = accumulate_result("./validate-label.csv", diastole_prob)
+
+
+# In[10]:
+
+# we have 2 subjects missing due to frame selection; use udibr's histogram result instead
+def doHist(data):
+    h = np.zeros(600)
+    for j in np.ceil(data).astype(int):
+        h[j:] += 1
+    h /= len(data)
+    return h
+train_csv = np.genfromtxt("./train-label.csv", delimiter=',')
+hSystole = doHist(train_csv[:, 1])
+hDiastole = doHist(train_csv[:, 2])
+
+
+# In[11]:
+
+def submission_helper(pred):
+    p = np.zeros(600)
+    pred.resize(p.shape)
+    p[0] = pred[0]
+    for j in range(1, 600):
+        a = p[j - 1]
+        b = pred[j]
+        if b < a:
+            p[j] = a
+        else:
+            p[j] = b
+    return p
+
+
+
+# In[12]:
+
+fi = csv.reader(open("data/sample_submission_validate.csv"))
+f = open("submission.csv", "w")
+fo = csv.writer(f, lineterminator='\n')
+fo.writerow(fi.__next__())
+for line in fi:
+    idx = line[0]
+    key, target = idx.split('_')
+    key = int(key)
+    out = [idx]
+    if key in stytole_result:
+        if target == 'Diastole':
+            out.extend(list(submission_helper(diastole_result[key])))
+        else:
+            out.extend(list(submission_helper(stytole_result[key])))
+    else:
+        print("Miss: %s" % idx)
+        if target == 'Diastole':
+            out.extend(hDiastole)
+        else:
+            out.extend(hSystole)
+    fo.writerow(out)
+f.close()
diff --git a/example/rnn/get_ptb_data.sh b/example/rnn/get_ptb_data.sh
new file mode 100644
index 000000000000..2b517f4ebc4d
--- /dev/null
+++ b/example/rnn/get_ptb_data.sh
@@ -0,0 +1,13 @@
+#!/usr/bin/env bash
+
+RNN_DIR=$(cd `dirname $0`; pwd)
+DATA_DIR="${RNN_DIR}/data/"
+
+if [[ ! -d "${DATA_DIR}" ]]; then
+  echo "${DATA_DIR} doesn't exist, will create one";
+  mkdir -p ${DATA_DIR}
+fi
+
+wget -P ${DATA_DIR} https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/ptb/ptb.train.txt;
+wget -P ${DATA_DIR} https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/ptb/ptb.valid.txt;
+wget -P ${DATA_DIR} https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/ptb/ptb.test.txt;
diff --git a/example/rnn/lstm.py b/example/rnn/lstm.py
index 33fdb507f0dd..b560d5d270cf 100644
--- a/example/rnn/lstm.py
+++ b/example/rnn/lstm.py
@@ -187,8 +187,8 @@ def calc_nll(seq_label_probs, X, begin):
 def train_lstm(model, X_train_batch, X_val_batch,
                num_round, update_period,
                optimizer='rmsprop', half_life=2,max_grad_norm = 5.0, **kwargs):
-    print("Training swith train.shape=%s" % str(X_train_batch.shape))
-    print("Training swith val.shape=%s" % str(X_val_batch.shape))
+    print("Training with train.shape=%s" % str(X_train_batch.shape))
+    print("Training with val.shape=%s" % str(X_val_batch.shape))
     m = model
     seq_len = len(m.seq_data)
     batch_size = m.seq_data[0].shape[0]
diff --git a/include/mxnet/c_predict_api.h b/include/mxnet/c_predict_api.h
index 989686414237..0290d84f6f38 100644
--- a/include/mxnet/c_predict_api.h
+++ b/include/mxnet/c_predict_api.h
@@ -37,6 +37,7 @@ typedef void *NDListHandle;
 * \return The last error happened at the predictor.
 */
MXNET_DLL const char* MXGetLastError();
+
/*!
 * \brief create a predictor
 * \param symbol_json_str The JSON string of the symbol.
@@ -57,14 +58,49 @@ MXNET_DLL const char* MXGetLastError();
 * \return 0 when success, -1 when failure.
 */
MXNET_DLL int MXPredCreate(const char* symbol_json_str,
-                           const char* param_bytes,
-                           size_t param_size,
+                           const void* param_bytes,
+                           int param_size,
                            int dev_type, int dev_id,
                            mx_uint num_input_nodes,
                            const char** input_keys,
                            const mx_uint* input_shape_indptr,
                            const mx_uint* input_shape_data,
                            PredictorHandle* out);
+
+/*!
+ * \brief create a predictor with customized outputs
+ * \param symbol_json_str The JSON string of the symbol.
+ * \param param_bytes The in-memory raw bytes of parameter ndarray file.
+ * \param param_size The size of parameter ndarray file.
+ * \param dev_type The device type, 1: cpu, 2:gpu
+ * \param dev_id The device id of the predictor.
+ * \param num_input_nodes Number of input nodes to the net,
+ *    For feedforward net, this is 1.
+ * \param input_keys The name of input argument.
+ *    For feedforward net, this is {"data"}
+ * \param input_shape_indptr Index pointer of shapes of each input node.
+ *    The length of this array = num_input_nodes + 1.
+ *    For feedforward net that takes 4 dimensional input, this is {0, 4}.
+ * \param input_shape_data The flattened shape data of each input node.
+ *    For feedforward net that takes 4 dimensional input, this is the shape data.
+ * \param num_output_nodes Number of output nodes to the net.
+ * \param output_keys The name of output argument.
+ *    For example {"global_pool"}
+ * \param out The created predictor handle.
+ * \return 0 when success, -1 when failure.
+ */
+MXNET_DLL int MXPredCreatePartialOut(const char* symbol_json_str,
+                                     const void* param_bytes,
+                                     int param_size,
+                                     int dev_type, int dev_id,
+                                     mx_uint num_input_nodes,
+                                     const char** input_keys,
+                                     const mx_uint* input_shape_indptr,
+                                     const mx_uint* input_shape_data,
+                                     mx_uint num_output_nodes,
+                                     const char** output_keys,
+                                     PredictorHandle* out);
/*!
 * \brief Get the shape of output node.
 *  The returned shape_data and shape_ndim is only valid before next call to MXPred function.
@@ -142,7 +178,7 @@ MXNET_DLL int MXPredFree(PredictorHandle handle);
 * \return 0 when success, -1 when failure.
 */
MXNET_DLL int MXNDListCreate(const char* nd_file_bytes,
-                             size_t nd_file_size,
+                             int nd_file_size,
                              NDListHandle *out,
                              mx_uint* out_length);
/*!
diff --git a/make/readthedocs.mk b/make/readthedocs.mk
index 32e965ae0d2a..92ca8241d322 100644
--- a/make/readthedocs.mk
+++ b/make/readthedocs.mk
@@ -40,7 +40,7 @@ USE_INTEL_PATH = NONE

 # the additional link flags you want to add
-ADD_LDFLAGS =
+ADD_LDFLAGS = -lgomp

 # the additional compile flags you want to add
 ADD_CFLAGS = -DMSHADOW_STAND_ALONE=1
diff --git a/matlab/+mxnet/model.m b/matlab/+mxnet/model.m
new file mode 100644
index 000000000000..af61091e9fc3
--- /dev/null
+++ b/matlab/+mxnet/model.m
@@ -0,0 +1,225 @@
+classdef model < handle
+%MODEL MXNet model, supports load and forward
+
+properties
+% The symbol definition, in json format
+  symbol
+% parameter weights
+  params
+% whether or not print info
+  verbose
+end
+
+properties (Access = private)
+% mxnet predictor
+  predictor
+% the previous input size
+  prev_input_size
+% the previous device id
+  prev_dev_id
+% the previous device type (cpu or gpu)
+  prev_dev_type
+% the previous output layers
+  prev_out_layers
+end
+
+methods
+  function obj = model()
+  %CONSTRUCTOR
+    obj.predictor = libpointer('voidPtr', 0);
+    obj.prev_input_size = zeros(1,4);
+    obj.verbose = 1;
+    obj.prev_dev_id = -1;
+    obj.prev_dev_type = -1;
+  end
+
+  function delete(obj)
+  %DESTRUCTOR
+    obj.free_predictor();
+  end
+
+  function load(obj, model_prefix, num_epoch)
+  %LOAD load model from files
+  %
+  % A mxnet model is stored into two files. The first one contains the symbol
+  % definition in json format, while the second one stores all weights in binary
+  % format. For example, if we save a model using the prefix 'model/vgg19' at
+  % epoch 8, then we will get two files, 'model/vgg19-symbol.json' and
+  % 'model/vgg19-0008.params'
+  %
+  % model_prefix : the string model prefix
+  % num_epoch : the epoch to load
+  %
+  % Example:
+  %   model = mxnet.model
+  %   model.load('output/vgg19', 8)
+
+    % read symbol
+    obj.symbol = fileread([model_prefix, '-symbol.json']);
+
+    % read params
+    fid = fopen(sprintf('%s-%04d.params', model_prefix, num_epoch), 'rb');
+    assert(fid ~= 0);
+    obj.params = fread(fid, inf, '*ubit8');
+    fclose(fid);
+  end
+
+  function json = parse_symbol(obj)
+    json = parse_json(obj.symbol);
+  end
+
+
+  function outputs = forward(obj, input, varargin)
+  %FORWARD perform forward
+  %
+  % OUT = MODEL.FORWARD(input) returns the forward (prediction) outputs of a list
+  % of input examples
+  %
+  % Examples
+  %
+  %   % load and resize an image
+  %   img = imread('test.jpg')
+  %   img = imresize(img, [224 224])
+  %   % get the softmax output
+  %   out = model.forward(img)
+  %   % get the output of two internal layers
+  %   out = model.forward(img, {'conv4', 'conv5'})
+  %   % use gpu 0
+  %   out = model.forward(img, 'gpu', 0)
+  %   % use two gpus for an image list
+  %   imgs(:,:,:,1) = img1
+  %   imgs(:,:,:,2) = img2
+  %   out = model.forward(imgs, 'gpu', [0,1])
+
+    % parse arguments
+    dev_type = 1; % cpu in default
+    dev_id = 0;
+    out_layers = {};
+    while length(varargin) > 0
+      if ischar(varargin{1}) && strcmp(varargin{1}, 'gpu')
+        assert(length(varargin) > 1, 'arg error: no gpu id')
+        assert(isnumeric(varargin{2}))
+        dev_type = 2;
+        dev_id = varargin{2};
+        varargin = varargin(3:end);
+        continue
+      end
+
+      if ischar(varargin{1})
+        out_layers{end+1} = varargin{1};
+        varargin = varargin(2:end);
+        continue
+      end
+
+      if iscell(varargin{1})
+        out_layers = varargin{1};
+        varargin = varargin(2:end);
+        continue
+      end
+    end
+
+    siz = size(input);
+    assert(length(siz) >= 2);
+
+    % convert from matlab order (col-major) into c order (row major):
+    input = obj.convert_ndarray(input);
+
+    if
obj.changed(siz, dev_type, dev_id, out_layers) + obj.free_predictor() + end + + if obj.predictor.Value == 0 + fprintf('create predictor with input size '); + fprintf('%d ', siz); + fprintf('\n'); + csize = [ones(1, 4-length(siz)), siz(end:-1:1)]; + callmxnet('MXPredCreatePartialOut', obj.symbol, ... + libpointer('voidPtr', obj.params), ... + length(obj.params), ... + int32(dev_type), int32(dev_id), ... + 1, {'data'}, ... + uint32([0, 4]), ... + uint32(csize), ... + uint32(length(out_layers)), out_layers, ... + obj.predictor); + end + + % feed input + callmxnet('MXPredSetInput', obj.predictor, 'data', single(input(:)), uint32(numel(input))); + % forward + callmxnet('MXPredForward', obj.predictor); + + % get output + num_out = 1; + if ~isempty(out_layers), num_out = length(out_layers); end + + if num_out == 1 + outputs = obj.get_output(0); + else + outputs = cell(num_out,1); + for i = 1 : num_out + outputs{i} = obj.get_output(i-1); + end + end + + end +end + +methods (Access = private) + function free_predictor(obj) + % free the predictor + if obj.predictor.Value ~= 0 + callmxnet('MXPredFree', obj.predictor); + obj.predictor = libpointer('voidPtr', 0); + end + end + + function Y = convert_ndarray(obj, X) + % convert between matlab's col major and c's row major + siz = size(X); + Y = permute(X, [2 1 3:length(siz)]); + end + + function ret = changed(obj, input_size, dev_type, dev_id, out_layers) + % check if arguments changed since last call + ret = 0; + if length(input_size) ~= length(obj.prev_input_size) || ... + any(input_size ~= obj.prev_input_size) || ... + dev_type ~= obj.prev_dev_type || ... + length(dev_id) ~= length(obj.prev_dev_id) || ... + any(dev_id ~= obj.prev_dev_id) || ... + length(out_layers) ~= length(obj.prev_out_layers) || ... + ~all(cellfun(@strcmp, out_layers, obj.prev_out_layers)) + ret = 1; + end + obj.prev_input_size = input_size; + obj.prev_dev_type = dev_type; + obj.prev_dev_id = dev_id; + obj.prev_out_layers = out_layers; + end + + function out = get_output(obj, index) + % get the i-th output + out_dim = libpointer('uint32Ptr', 0); + out_shape = libpointer('uint32PtrPtr', ones(4,1)); + callmxnet('MXPredGetOutputShape', obj.predictor, index, out_shape, out_dim); + assert(out_dim.Value <= 4); + out_siz = out_shape.Value(1:out_dim.Value); + out_siz = double(out_siz(end:-1:1))'; + + % get output + out = libpointer('singlePtr', single(zeros(out_siz))); + + callmxnet('MXPredGetOutput', obj.predictor, index, ... + out, uint32(prod(out_siz))); + + % TODO convert from c order to matlab order... + out = reshape(out.Value, out_siz); + if length(out_siz) > 2 + out = obj.convert_ndarray(out); + end + end + +end + +end diff --git a/matlab/+mxnet/private/callmxnet.m b/matlab/+mxnet/private/callmxnet.m new file mode 100644 index 000000000000..3ddaa8acdef7 --- /dev/null +++ b/matlab/+mxnet/private/callmxnet.m @@ -0,0 +1,27 @@ +function callmxnet(func, varargin) +%CALLMXNET call mxnet functions + +if ~libisloaded('libmxnet') + cur_pwd = pwd; + mxnet_root = [fileparts(mfilename('fullpath')), '/../../../']; + cd(mxnet_root); + mxnet_root = pwd; + cd(cur_pwd); + assert(exist([mxnet_root, '/lib/libmxnet.so'], 'file') == 2 || ... + exist([mxnet_root, '/lib/libmxnet.dylib'], 'file') == 2 || ... + exist([mxnet_root, '/lib/libmxnet.dll'], 'file') == 2, ... + 'you need to build mxnet first'); + assert(exist([mxnet_root, '/include/mxnet/c_predict_api.h']) == 2, ... 
+ 'failed to find c_predict_api.h') + addpath([mxnet_root, '/lib']) + addpath([mxnet_root, '/include/mxnet']) + + [err, warn] = loadlibrary('libmxnet', 'c_predict_api.h'); + assert(isempty(err)); + if warn, warn, end +end + +assert(ischar(func)) +ret = calllib('libmxnet', func, varargin{:}); +assert(ret == 0) +end diff --git a/matlab/+mxnet/private/parse_json.m b/matlab/+mxnet/private/parse_json.m new file mode 100644 index 000000000000..6aa0b4e5a0f3 --- /dev/null +++ b/matlab/+mxnet/private/parse_json.m @@ -0,0 +1,617 @@ +function data = parse_json(fname,varargin) +%PARSE_JSON parse a JSON (JavaScript Object Notation) file or string +% +% Based on jsonlab (https://github.com/fangq/jsonlab) created by Qianqian Fang. Jsonlab is lisonced under BSD or GPL v3. + +global pos inStr len esc index_esc len_esc isoct arraytoken + +if(regexp(fname,'^\s*(?:\[.+\])|(?:\{.+\})\s*$','once')) + string=fname; +elseif(exist(fname,'file')) + try + string = fileread(fname); + catch + try + string = urlread(['file://',fname]); + catch + string = urlread(['file://',fullfile(pwd,fname)]); + end + end +else + error('input file does not exist'); +end + +pos = 1; len = length(string); inStr = string; +isoct=exist('OCTAVE_VERSION','builtin'); +arraytoken=find(inStr=='[' | inStr==']' | inStr=='"'); +jstr=regexprep(inStr,'\\\\',' '); +escquote=regexp(jstr,'\\"'); +arraytoken=sort([arraytoken escquote]); + +% String delimiters and escape chars identified to improve speed: +esc = find(inStr=='"' | inStr=='\' ); % comparable to: regexp(inStr, '["\\]'); +index_esc = 1; len_esc = length(esc); + +opt=varargin2struct(varargin{:}); + +if(jsonopt('ShowProgress',0,opt)==1) + opt.progressbar_=waitbar(0,'loading ...'); +end +jsoncount=1; +while pos <= len + switch(next_char) + case '{' + data{jsoncount} = parse_object(opt); + case '[' + data{jsoncount} = parse_array(opt); + otherwise + error_pos('Outer level structure must be an object or an array'); + end + jsoncount=jsoncount+1; +end % while + +jsoncount=length(data); +if(jsoncount==1 && iscell(data)) + data=data{1}; +end + +if(isfield(opt,'progressbar_')) + close(opt.progressbar_); +end + +%%------------------------------------------------------------------------- +function object = parse_object(varargin) + parse_char('{'); + object = []; + if next_char ~= '}' + while 1 + str = parseStr(varargin{:}); + if isempty(str) + error_pos('Name of value at position %d cannot be empty'); + end + parse_char(':'); + val = parse_value(varargin{:}); + object.(valid_field(str))=val; + if next_char == '}' + break; + end + parse_char(','); + end + end + parse_char('}'); + if(isstruct(object)) + object=struct2jdata(object); + end + +%%------------------------------------------------------------------------- + +function object = parse_array(varargin) % JSON array is written in row-major order +global pos inStr isoct + parse_char('['); + object = cell(0, 1); + dim2=[]; + arraydepth=jsonopt('JSONLAB_ArrayDepth_',1,varargin{:}); + pbar=-1; + if(isfield(varargin{1},'progressbar_')) + pbar=varargin{1}.progressbar_; + end + + if next_char ~= ']' + if(jsonopt('FastArrayParser',1,varargin{:})>=1 && arraydepth>=jsonopt('FastArrayParser',1,varargin{:})) + [endpos, e1l, e1r]=matching_bracket(inStr,pos); + arraystr=['[' inStr(pos:endpos)]; + arraystr=regexprep(arraystr,'"_NaN_"','NaN'); + arraystr=regexprep(arraystr,'"([-+]*)_Inf_"','$1Inf'); + arraystr(arraystr==sprintf('\n'))=[]; + arraystr(arraystr==sprintf('\r'))=[]; + %arraystr=regexprep(arraystr,'\s*,',','); % this is slow,sometimes needed + 
if(~isempty(e1l) && ~isempty(e1r)) % the array is in 2D or higher D + astr=inStr((e1l+1):(e1r-1)); + astr=regexprep(astr,'"_NaN_"','NaN'); + astr=regexprep(astr,'"([-+]*)_Inf_"','$1Inf'); + astr(astr==sprintf('\n'))=[]; + astr(astr==sprintf('\r'))=[]; + astr(astr==' ')=''; + if(isempty(find(astr=='[', 1))) % array is 2D + dim2=length(sscanf(astr,'%f,',[1 inf])); + end + else % array is 1D + astr=arraystr(2:end-1); + astr(astr==' ')=''; + [obj, count, errmsg, nextidx]=sscanf(astr,'%f,',[1,inf]); + if(nextidx>=length(astr)-1) + object=obj; + pos=endpos; + parse_char(']'); + return; + end + end + if(~isempty(dim2)) + astr=arraystr; + astr(astr=='[')=''; + astr(astr==']')=''; + astr(astr==' ')=''; + [obj, count, errmsg, nextidx]=sscanf(astr,'%f,',inf); + if(nextidx>=length(astr)-1) + object=reshape(obj,dim2,numel(obj)/dim2)'; + pos=endpos; + parse_char(']'); + if(pbar>0) + waitbar(pos/length(inStr),pbar,'loading ...'); + end + return; + end + end + arraystr=regexprep(arraystr,'\]\s*,','];'); + else + arraystr='['; + end + try + if(isoct && regexp(arraystr,'"','once')) + error('Octave eval can produce empty cells for JSON-like input'); + end + object=eval(arraystr); + pos=endpos; + catch + while 1 + newopt=varargin2struct(varargin{:},'JSONLAB_ArrayDepth_',arraydepth+1); + val = parse_value(newopt); + object{end+1} = val; + if next_char == ']' + break; + end + parse_char(','); + end + end + end + if(jsonopt('SimplifyCell',0,varargin{:})==1) + try + oldobj=object; + object=cell2mat(object')'; + if(iscell(oldobj) && isstruct(object) && numel(object)>1 && jsonopt('SimplifyCellArray',1,varargin{:})==0) + object=oldobj; + elseif(size(object,1)>1 && ismatrix(object)) + object=object'; + end + catch + end + end + parse_char(']'); + + if(pbar>0) + waitbar(pos/length(inStr),pbar,'loading ...'); + end +%%------------------------------------------------------------------------- + +function parse_char(c) + global pos inStr len + pos=skip_whitespace(pos,inStr,len); + if pos > len || inStr(pos) ~= c + error_pos(sprintf('Expected %c at position %%d', c)); + else + pos = pos + 1; + pos=skip_whitespace(pos,inStr,len); + end + +%%------------------------------------------------------------------------- + +function c = next_char + global pos inStr len + pos=skip_whitespace(pos,inStr,len); + if pos > len + c = []; + else + c = inStr(pos); + end + +%%------------------------------------------------------------------------- + +function newpos=skip_whitespace(pos,inStr,len) + newpos=pos; + while newpos <= len && isspace(inStr(newpos)) + newpos = newpos + 1; + end + +%%------------------------------------------------------------------------- +function str = parseStr(varargin) + global pos inStr len esc index_esc len_esc + % len, ns = length(inStr), keyboard + if inStr(pos) ~= '"' + error_pos('String starting with " expected at position %d'); + else + pos = pos + 1; + end + str = ''; + while pos <= len + while index_esc <= len_esc && esc(index_esc) < pos + index_esc = index_esc + 1; + end + if index_esc > len_esc + str = [str inStr(pos:len)]; + pos = len + 1; + break; + else + str = [str inStr(pos:esc(index_esc)-1)]; + pos = esc(index_esc); + end + nstr = length(str); + switch inStr(pos) + case '"' + pos = pos + 1; + if(~isempty(str)) + if(strcmp(str,'_Inf_')) + str=Inf; + elseif(strcmp(str,'-_Inf_')) + str=-Inf; + elseif(strcmp(str,'_NaN_')) + str=NaN; + end + end + return; + case '\' + if pos+1 > len + error_pos('End of file reached right after escape character'); + end + pos = pos + 1; + switch inStr(pos) + case {'"' 
'\' '/'} + str(nstr+1) = inStr(pos); + pos = pos + 1; + case {'b' 'f' 'n' 'r' 't'} + str(nstr+1) = sprintf(['\' inStr(pos)]); + pos = pos + 1; + case 'u' + if pos+4 > len + error_pos('End of file reached in escaped unicode character'); + end + str(nstr+(1:6)) = inStr(pos-1:pos+4); + pos = pos + 5; + end + otherwise % should never happen + str(nstr+1) = inStr(pos); + keyboard; + pos = pos + 1; + end + end + error_pos('End of file while expecting end of inStr'); + +%%------------------------------------------------------------------------- + +function num = parse_number(varargin) + global pos inStr isoct + currstr=inStr(pos:min(pos+30,end)); + if(isoct~=0) + numstr=regexp(currstr,'^\s*-?(?:0|[1-9]\d*)(?:\.\d+)?(?:[eE][+\-]?\d+)?','end'); + [num] = sscanf(currstr, '%f', 1); + delta=numstr+1; + else + [num, one, err, delta] = sscanf(currstr, '%f', 1); + if ~isempty(err) + error_pos('Error reading number at position %d'); + end + end + pos = pos + delta-1; + +%%------------------------------------------------------------------------- + +function val = parse_value(varargin) + global pos inStr len + + if(isfield(varargin{1},'progressbar_')) + waitbar(pos/len,varargin{1}.progressbar_,'loading ...'); + end + + switch(inStr(pos)) + case '"' + val = parseStr(varargin{:}); + return; + case '[' + val = parse_array(varargin{:}); + return; + case '{' + val = parse_object(varargin{:}); + return; + case {'-','0','1','2','3','4','5','6','7','8','9'} + val = parse_number(varargin{:}); + return; + case 't' + if pos+3 <= len && strcmpi(inStr(pos:pos+3), 'true') + val = true; + pos = pos + 4; + return; + end + case 'f' + if pos+4 <= len && strcmpi(inStr(pos:pos+4), 'false') + val = false; + pos = pos + 5; + return; + end + case 'n' + if pos+3 <= len && strcmpi(inStr(pos:pos+3), 'null') + val = []; + pos = pos + 4; + return; + end + end + error_pos('Value expected at position %d'); +%%------------------------------------------------------------------------- + +function error_pos(msg) + global pos inStr len + poShow = max(min([pos-15 pos-1 pos pos+20],len),1); + if poShow(3) == poShow(2) + poShow(3:4) = poShow(2)+[0 -1]; % display nothing after + end + msg = [sprintf(msg, pos) ': ' ... + inStr(poShow(1):poShow(2)) '' inStr(poShow(3):poShow(4)) ]; + error( ['JSONparser:invalidFormat: ' msg] ); + +%%------------------------------------------------------------------------- + +function str = valid_field(str) +global isoct +% From MATLAB doc: field names must begin with a letter, which may be +% followed by any combination of letters, digits, and underscores. +% Invalid characters will be converted to underscores, and the prefix +% "x0x[Hex code]_" will be added if the first character is not a letter. 
+ pos=regexp(str,'^[^A-Za-z]','once'); + if(~isempty(pos)) + if(~isoct) + str=regexprep(str,'^([^A-Za-z])','x0x${sprintf(''%X'',unicode2native($1))}_','once'); + else + str=sprintf('x0x%X_%s',char(str(1)),str(2:end)); + end + end + if(isempty(regexp(str,'[^0-9A-Za-z_]', 'once' ))) + return; + end + if(~isoct) + str=regexprep(str,'([^0-9A-Za-z_])','_0x${sprintf(''%X'',unicode2native($1))}_'); + else + pos=regexp(str,'[^0-9A-Za-z_]'); + if(isempty(pos)) + return; + end + str0=str; + pos0=[0 pos(:)' length(str)]; + str=''; + for i=1:length(pos) + str=[str str0(pos0(i)+1:pos(i)-1) sprintf('_0x%X_',str0(pos(i)))]; + end + if(pos(end)~=length(str)) + str=[str str0(pos0(end-1)+1:pos0(end))]; + end + end + %str(~isletter(str) & ~('0' <= str & str <= '9')) = '_'; + +%%------------------------------------------------------------------------- +function endpos = matching_quote(str,pos) +len=length(str); +while(pos1 && str(pos-1)=='\')) + endpos=pos; + return; + end + end + pos=pos+1; +end +error('unmatched quotation mark'); +%%------------------------------------------------------------------------- +function [endpos, e1l, e1r, maxlevel] = matching_bracket(str,pos) +global arraytoken +level=1; +maxlevel=level; +endpos=0; +bpos=arraytoken(arraytoken>=pos); +tokens=str(bpos); +len=length(tokens); +pos=1; +e1l=[]; +e1r=[]; +while(pos<=len) + c=tokens(pos); + if(c==']') + level=level-1; + if(isempty(e1r)) + e1r=bpos(pos); + end + if(level==0) + endpos=bpos(pos); + return + end + end + if(c=='[') + if(isempty(e1l)) + e1l=bpos(pos); + end + level=level+1; + maxlevel=max(maxlevel,level); + end + if(c=='"') + pos=matching_quote(tokens,pos+1); + end + pos=pos+1; +end +if(endpos==0) + error('unmatched "]"'); +end + +function opt=varargin2struct(varargin) +% +% opt=varargin2struct('param1',value1,'param2',value2,...) +% or +% opt=varargin2struct(...,optstruct,...) +% +% convert a series of input parameters into a structure +% +% input: +% 'param', value: the input parameters should be pairs of a string and a value +% optstruct: if a parameter is a struct, the fields will be merged to the output struct +% +% output: +% opt: a struct where opt.param1=value1, opt.param2=value2 ... +% + +len=length(varargin); +opt=struct; +if(len==0) return; end +i=1; +while(i<=len) + if(isstruct(varargin{i})) + opt=mergestruct(opt,varargin{i}); + elseif(ischar(varargin{i}) && i nmr.mgh.harvard.edu) +% +% $Id: loadjson.m 371 2012-06-20 12:43:06Z fangq $ +% +% input: +% key: a string with which one look up a value from a struct +% default: if the key does not exist, return default +% optstruct: a struct where each sub-field is a key +% +% output: +% val: if key exists, val=optstruct.key; otherwise val=default +% + +val=default; +if(nargin<=2) return; end +opt=varargin{1}; +if(isstruct(opt)) + if(isfield(opt,key)) + val=getfield(opt,key); + elseif(isfield(opt,lower(key))) + val=getfield(opt,lower(key)); + end +end + +function s=mergestruct(s1,s2) +% +% s=mergestruct(s1,s2) +% +% merge two struct objects into one +% +% authors:Qianqian Fang (fangq nmr.mgh.harvard.edu) +% date: 2012/12/22 +% +% input: +% s1,s2: a struct object, s1 and s2 can not be arrays +% +% output: +% s: the merged struct object. fields in s1 and s2 will be combined in s. 
+
+if(~isstruct(s1) || ~isstruct(s2))
+    error('input parameters contain non-struct');
+end
+if(length(s1)>1 || length(s2)>1)
+    error('can not merge struct arrays');
+end
+fn=fieldnames(s2);
+s=s1;
+for i=1:length(fn)
+    s=setfield(s,fn{i},getfield(s2,fn{i}));
+end
+
+function newdata=struct2jdata(data,varargin)
+%
+% newdata=struct2jdata(data,opt,...)
+%
+% convert a JData object (in the form of a struct array) into an array
+%
+% authors:Qianqian Fang (fangq nmr.mgh.harvard.edu)
+%
+% input:
+%      data: a struct array. If data contains JData keywords in the first
+%            level children, these fields are parsed and regrouped into a
+%            data object (arrays, trees, graphs etc) based on JData
+%            specification. The JData keywords are
+%               "_ArrayType_", "_ArraySize_", "_ArrayData_"
+%               "_ArrayIsSparse_", "_ArrayIsComplex_"
+%      opt: (optional) a list of 'Param',value pairs for additional options
+%           The supported options include
+%               'Recursive', if set to 1, will apply the conversion to
+%                            every child; 0 to disable
+%
+% output:
+%      newdata: the converted data if the input data does contain a JData
+%               structure; otherwise, the same as the input.
+%
+% examples:
+%      obj=struct('_ArrayType_','double','_ArraySize_',[2 3],
+%                 '_ArrayIsSparse_',1 ,'_ArrayData_',null);
+%      ubjdata=struct2jdata(obj);
+
+fn=fieldnames(data);
+newdata=data;
+len=length(data);
+if(jsonopt('Recursive',0,varargin{:})==1)
+    for i=1:length(fn) % depth-first
+        for j=1:len
+            if(isstruct(getfield(data(j),fn{i})))
+                newdata(j)=setfield(newdata(j),fn{i},jstruct2array(getfield(data(j),fn{i})));
+            end
+        end
+    end
+end
+if(~isempty(strmatch('x0x5F_ArrayType_',fn)) && ~isempty(strmatch('x0x5F_ArrayData_',fn)))
+    newdata=cell(len,1);
+    for j=1:len
+        ndata=cast(data(j).x0x5F_ArrayData_,data(j).x0x5F_ArrayType_);
+        iscpx=0;
+        if(~isempty(strmatch('x0x5F_ArrayIsComplex_',fn)))
+            if(data(j).x0x5F_ArrayIsComplex_)
+                iscpx=1;
+            end
+        end
+        if(~isempty(strmatch('x0x5F_ArrayIsSparse_',fn)))
+            if(data(j).x0x5F_ArrayIsSparse_)
+                if(~isempty(strmatch('x0x5F_ArraySize_',fn)))
+                    dim=double(data(j).x0x5F_ArraySize_);
+                    if(iscpx && size(ndata,2)==4-any(dim==1))
+                        ndata(:,end-1)=complex(ndata(:,end-1),ndata(:,end));
+                    end
+                    if isempty(ndata)
+                        % All-zeros sparse
+                        ndata=sparse(dim(1),prod(dim(2:end)));
+                    elseif dim(1)==1
+                        % Sparse row vector
+                        ndata=sparse(1,ndata(:,1),ndata(:,2),dim(1),prod(dim(2:end)));
+                    elseif dim(2)==1
+                        % Sparse column vector
+                        ndata=sparse(ndata(:,1),1,ndata(:,2),dim(1),prod(dim(2:end)));
+                    else
+                        % Generic sparse array.
+                        ndata=sparse(ndata(:,1),ndata(:,2),ndata(:,3),dim(1),prod(dim(2:end)));
+                    end
+                else
+                    if(iscpx && size(ndata,2)==4)
+                        ndata(:,3)=complex(ndata(:,3),ndata(:,4));
+                    end
+                    ndata=sparse(ndata(:,1),ndata(:,2),ndata(:,3));
+                end
+            end
+        elseif(~isempty(strmatch('x0x5F_ArraySize_',fn)))
+            if(iscpx && size(ndata,2)==2)
+                ndata=complex(ndata(:,1),ndata(:,2));
+            end
+            ndata=reshape(ndata(:),data(j).x0x5F_ArraySize_);
+        end
+        newdata{j}=ndata;
+    end
+    if(len==1)
+        newdata=newdata{1};
+    end
+end
diff --git a/matlab/README.md b/matlab/README.md
new file mode 100644
index 000000000000..2e288c48a0d1
--- /dev/null
+++ b/matlab/README.md
@@ -0,0 +1,81 @@
+# MATLAB binding for mxnet
+
+### How to use
+
+The only requirement is to build mxnet to get `lib/libmxnet.so`.
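+A minimal sketch of that prerequisite on Linux (assuming the dependencies from the main MXNet build guide are already installed):
+
+```bash
+# from the mxnet source root; produces lib/libmxnet.so on success
+make -j4
+```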
+Sample usage
+
+- Load model and data:
+
+  ```matlab
+  img = single(imresize(imread('cat.png'), [224 224])) - 120;
+  model = mxnet.model;
+  model.load('model/Inception_BN', 39);
+  ```
+
+- Get prediction:
+
+  ```matlab
+  pred = model.forward(img);
+  ```
+
+- Do feature extraction on GPU 0:
+
+  ```matlab
+  feas = model.forward(img, 'gpu', 0, {'max_pool_5b_pool', 'global_pool', 'fc'});
+  ```
+
+- See [demo.m](demo.m) for more examples
+
+### Note on Implementation
+
+We use `loadlibrary` to load the mxnet library directly into Matlab and `calllib` to
+call MXNet functions. Note that Matlab uses column-major order to store N-dim
+arrays while MXNet uses row-major order. So assume we create an array in
+matlab with
+
+```matlab
+X = zeros([2,3,4,5]);
+```
+
+If we pass the memory of `X` into MXNet, then the correct shape will be
+`[5,4,3,2]` in MXNet. When processing images, MXNet assumes the data layout is
+
+```c++
+example x channel x width x height
+```
+
+while in matlab we often store images by
+
+```matlab
+width x height x channel x example
+```
+
+So we should permute the dimensions by `X = permute(X, [2, 1, 3, 4])` before
+passing `X` into MXNet.
+
+### FAQ
+
+1. You may get the error that `GLIBCXX_x.x.xx` is not found, such as on Ubuntu 14.04:
+
+```
+> In loadlibrary (line 359)
+Error using loadlibrary (line 447)
+There was an error loading the library "/home/muli/work/mxnet/lib/libmxnet.so"
+/usr/local/MATLAB/R2015a/bin/glnxa64/../../sys/os/glnxa64/libstdc++.so.6:
+version `GLIBCXX_3.4.18' not found (required by
+/home/muli/work/mxnet/lib/libmxnet.so)
+
+Caused by:
+    Error using loaddefinedlibrary
+    /usr/local/MATLAB/R2015a/bin/glnxa64/../../sys/os/glnxa64/libstdc++.so.6:
+    version `GLIBCXX_3.4.18' not found (required by
+    /home/muli/work/mxnet/lib/libmxnet.so)
+```
+
+   One way to fix it is to link `MATLAB_ROOT/sys/os/glnxa64/libstdc++.so.6` to
+   your system's `libstdc++`.
For example + +```bash +muli@ghc:/usr/local/MATLAB/R2015a/sys/os/glnxa64$ sudo rm -r libstdc++.so.6 +muli@ghc:/usr/local/MATLAB/R2015a/sys/os/glnxa64$ sudo ln -s /usr/lib/x86_64-linux-gnu/libstdc++.so.6.0.19 libstdc++.so.6 +``` diff --git a/matlab/demo.m b/matlab/demo.m new file mode 100644 index 000000000000..b938af573c4a --- /dev/null +++ b/matlab/demo.m @@ -0,0 +1,55 @@ +%% Download sample image and model +if ~exist('cat.png', 'file') + assert(~system('wget --no-check-certificate https://raw.githubusercontent.com/dmlc/mxnet.js/master/data/cat.png')); +end + +if ~exist('model/Inception_BN-0039.params', 'file') + assert(~system('wget --no-check-certificate https://s3.amazonaws.com/dmlc/model/inception-bn.tar.gz')); + assert(~system('tar -zxvf inception-bn.tar.gz')) +end + +%% Load the model +clear model +model = mxnet.model; +model.load('model/Inception_BN', 39); + +%% Load and resize the image +img = imresize(imread('cat.png'), [224 224]); +img = single(img) - 120; +%% Run prediction +pred = model.forward(img); + +%% load the labels +labels = {}; +fid = fopen('model/synset.txt', 'r'); +assert(fid >= 0); +tline = fgetl(fid); +while ischar(tline) + labels{end+1} = tline; + tline = fgetl(fid); +end +fclose(fid); + +%% find the predict label +[p, i] = max(pred); +fprintf('the best result is %s, with probability %f\n', labels{i}, p) + +%% Print the last 10 layers in the symbol + +sym = model.parse_symbol(); +layers = {}; +for i = 1 : length(sym.nodes) + if ~strcmp(sym.nodes{i}.op, 'null') + layers{end+1} = sym.nodes{i}.name; + end +end +fprintf('layer name: %s\n', layers{end-10:end}) + +%% Extract feature from internal layers + +feas = model.forward(img, {'max_pool_5b_pool', 'global_pool', 'fc'}); +feas(:) + +%% If GPU is available +% feas = model.forward(img, 'gpu', 0, {'max_pool_5b_pool', 'global_pool', 'fc'}); +% feas(:) diff --git a/matlab/tests/prepare_data.m b/matlab/tests/prepare_data.m new file mode 100644 index 000000000000..6d450cdd36c1 --- /dev/null +++ b/matlab/tests/prepare_data.m @@ -0,0 +1,36 @@ +%% download cifar10 dataset +system('wget https://www.cs.toronto.edu/~kriz/cifar-10-matlab.tar.gz') +system('tar -xzvf cifar-10-matlab.tar.gz') +load cifar-10-batches-mat/test_batch.mat + +%% convert test dataset of cifar10, and save +X = reshape(data', [32, 32, 3, 10000]); +X = permute(X, [2 1 3 4]); +Y = labels + 1; + + +save cifar10-test X Y +%% preview one picture +imshow(imresize(X(:,:,:,2), [128, 128])) + +%% + +!wget http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz +!wget http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz +!gunzip t10k-images-idx3-ubyte.gz +!gunzip t10k-labels-idx1-ubyte.gz + +%% + +fid = fopen('t10k-images-idx3-ubyte', 'r'); +d = fread(fid, inf, '*uint8'); +fclose(fid); +X = reshape(d(17:end), [28 28 1 10000]); +X = permute(X, [2 1 3 4]); + +fid = fopen('t10k-labels-idx1-ubyte', 'r'); +d = fread(fid, inf, '*uint8'); +fclose(fid); +Y = d(9:end) + 1; + +save mnist-test X Y diff --git a/matlab/tests/test_prediction.m b/matlab/tests/test_prediction.m new file mode 100644 index 000000000000..fe7d7a68ecc3 --- /dev/null +++ b/matlab/tests/test_prediction.m @@ -0,0 +1,105 @@ +%% prepare + +addpath('..') + +if ~exist('mnist-test.mat', 'file') + system('wget --no-check-certificate https://github.com/dmlc/web-data/raw/master/mxnet/matlab/mnist-test.mat'); +end + +if ~exist('model/mnist-lenet-0-0010.params', 'file') + system('wget --no-check-certificate https://github.com/dmlc/web-data/raw/master/mxnet/matlab/mnist-lenet.tar.gz'); + system('tar -zxf 
mnist-lenet.tar.gz'); +end + +%% load data and model + +load mnist-test +clear model +model = mxnet.model; +model.load('model/mnist-lenet-0', 10); + +%% predict + +err = 0; +batch = 1000; +for i = 1 : length(Y) / batch + ix = (i-1)*batch+1 : i*batch; + x = X(:,:,:,ix); + pred = model.forward(x, 'gpu', 0); + [~, k] = max(pred); + err = err + nnz(k ~= Y(ix)'); +end + +err = err / length(Y); +fprintf('prediction error: %f\n', err) + +%% +% ix = 1:2; +% x = X(:,:,:,ix); +% pred = model.forward(x, {'pooling1', 'fullyconnected1', 'softmax'}); + +%% +% batch = 1000; +% e = 0; +% for i = 1 : batch +% x = single(X(:,:,:,i)); +% pred = model.forward(x); +% [~, k] = max(pred); +% e = e + (k == Y(i)); +% end + +% e / batch + +% %% load data +% load cifar10-test.mat +% img_mean = [123.68, 116.779, 103.939]; + +% %% +% clear model +% model = mxnet.model; +% model.load('model/cifar10-incept-bn-0', 20); + +% %% +% batch = 100; +% x = zeros(28,28,3,batch); +% for i = 1 : batch +% x(:,:,:,i) = single(imresize(X(:,:,:,i), [28, 28])); +% x = x(:,:,[3 2 1],:); +% end +% % x = permute(x, [2 1 3 4]); + +% x = x - 120; +% % for i = 1 : 3 +% % x(:,:,i,:) = x(:,:,i,:) - img_mean(i); +% % end + + +% pred = model.forward(x, 'gpu', 0); + +% [~,i] = max(reshape(pred(:), 10, batch)); +% nnz(i' == Y(1:batch)) / length(i) + +% %% + +% batch = 100; +% e = 0; +% for i = 1 : batch +% x = single(imresize(X(:,:,:,i), [28, 28])) - 120; +% for j = 1 : 3 +% x(:,:,j) = x(:,:,j); +% end +% pred = model.forward(x); +% [~, k] = max(pred); +% e = e + (k == Y(i)); +% end + +% e / batch + + +% %% load bin + +% a = fopen('mean.bin', 'r'); +% yy = fread(a, 14, '*int32'); +% mm = fread(a, inf, '*single'); +% fclose(a) +% % nn = mm(14:end-6); diff --git a/mshadow b/mshadow index 120acae34880..47521c6f8e0d 160000 --- a/mshadow +++ b/mshadow @@ -1 +1 @@ -Subproject commit 120acae3488099d8f27886448cb225aca1c86a31 +Subproject commit 47521c6f8e0d62a0224bc5bb19b60cc6a0d6a95c diff --git a/python/mxnet/recordio.py b/python/mxnet/recordio.py index 0deb7bac6cf4..ca30f1c69f89 100644 --- a/python/mxnet/recordio.py +++ b/python/mxnet/recordio.py @@ -157,7 +157,7 @@ def unpack_img(s, iscolor=-1): img = cv2.imdecode(img, iscolor) return header, img -def pack_img(header, img, quality=80, img_fmt='.JPEG'): +def pack_img(header, img, quality=80, img_fmt='.jpg'): """pack an image into MXImageRecord Parameters @@ -167,7 +167,9 @@ def pack_img(header, img, quality=80, img_fmt='.JPEG'): img : numpy.ndarray image to pack quality : int - quality for JPEG encoding. 1-100 + quality for JPEG encoding. 1-100, or compression for PNG encoding. 1-9. + img_fmt : str + Encoding of the image. .jpg for JPEG, .png for PNG. 
Returns ------- @@ -175,6 +177,14 @@ def pack_img(header, img, quality=80, img_fmt='.JPEG'): The packed string """ assert opencv_available - ret, buf = cv2.imencode(img_fmt, img, [cv2.IMWRITE_JPEG_QUALITY, quality]) + jpg_formats = set(['.jpg', '.jpeg', '.JPG', '.JPEG']) + png_formats = set(['.png', '.PNG']) + encode_params = None + if img_fmt in jpg_formats: + encode_params = [cv2.IMWRITE_JPEG_QUALITY, quality] + elif img_fmt in png_formats: + encode_params = [cv2.IMWRITE_PNG_COMPRESSION, quality] + + ret, buf = cv2.imencode(img_fmt, img, encode_params) assert ret, 'failed encoding image' return pack(header, buf.tostring()) diff --git a/scala-package/core/pom.xml b/scala-package/core/pom.xml index d0cc9082c5dd..e65f9e761598 100644 --- a/scala-package/core/pom.xml +++ b/scala-package/core/pom.xml @@ -54,6 +54,10 @@ -Djava.library.path=${project.parent.basedir}/native/${platform}/target + + org.scalastyle + scalastyle-maven-plugin + diff --git a/scala-package/core/src/main/resources/scalastyle_config.xml b/scala-package/core/src/main/resources/scalastyle_config.xml new file mode 100644 index 000000000000..f5b043c57cce --- /dev/null +++ b/scala-package/core/src/main/resources/scalastyle_config.xml @@ -0,0 +1,9 @@ + + Scalastyle standard configuration + + + + 800 + + + \ No newline at end of file diff --git a/scala-package/core/src/main/scala/ml/dmlc/mxnet/EvalMetric.scala b/scala-package/core/src/main/scala/ml/dmlc/mxnet/EvalMetric.scala new file mode 100644 index 000000000000..561ba19389a4 --- /dev/null +++ b/scala-package/core/src/main/scala/ml/dmlc/mxnet/EvalMetric.scala @@ -0,0 +1,108 @@ +package ml.dmlc.mxnet + +/** + * Base class of all evaluation metrics + * @param name Metric name + * + * @author Yuan Tang + */ +abstract class EvalMetric(protected val name: String) { + + protected var numInst: Int = 0 + protected var sumMetric: Float = 0.0f + + /** + * Update the internal evaluation. + * + * @param labels The labels of the data + * @param preds Predicted values. + */ + def update(labels: NDArray, preds: NDArray): Unit + + /** + * Clear the internal statistics to initial state. + */ + def reset: Unit = { + this.numInst = 0 + this.sumMetric = 0.0f + } + + /** + * Get the current evaluation result. 
+ * @return name, Name of the metric
+ *         value, Value of the evaluation
+ */
+  def get: (String, Float) = {
+    (this.name, this.sumMetric / this.numInst)
+  }
+}
+
+/**
+ * Calculate accuracy
+ */
+class Accuracy extends EvalMetric("accuracy") {
+  def update(labels: NDArray, preds: NDArray): Unit = {
+    require(labels.size == preds.size, "labels and predictions should have the same length.")
+    // iterate instance by instance; slice(i, i + 1) picks the i-th row
+    (0 until preds.size) foreach (i => {
+      val pred: NDArray = preds.slice(i, i + 1)
+      val label: NDArray = labels.slice(i, i + 1)
+      // predicted class is the index of the largest score
+      val predLabel = pred.toArray.zipWithIndex.maxBy(_._1)._2
+      if (predLabel == label.toScalar.toInt) {
+        this.sumMetric += 1
+      }
+      this.numInst += 1
+    })
+  }
+}
+
+/**
+ * Calculate Mean Absolute Error loss
+ */
+class MAE extends EvalMetric("mae") {
+  def update(labels: NDArray, preds: NDArray): Unit = {
+    require(labels.size == preds.size, "labels and predictions should have the same length.")
+    for ((label, pred) <- labels.toArray zip preds.toArray) {
+      this.sumMetric += Math.abs(label - pred)
+      this.numInst += 1
+    }
+  }
+}
+
+/**
+ * Calculate Root Mean Squared Error loss
+ */
+class RMSE extends EvalMetric("rmse") {
+  def update(labels: NDArray, preds: NDArray): Unit = {
+    require(labels.size == preds.size, "labels and predictions should have the same length.")
+    var squaredError = 0f
+    for ((label, pred) <- labels.toArray zip preds.toArray) {
+      squaredError += (label - pred) * (label - pred)
+    }
+    // RMSE is accumulated per batch, matching the Python metric
+    this.sumMetric += Math.sqrt(squaredError / labels.size).toFloat
+    this.numInst += 1
+  }
+}
+
+/**
+ * Custom evaluation metric that takes a NDArray function.
+ * @param fEval Customized evaluation function that maps (labels, preds)
+ *              to the metric value of the batch
+ * @param name The name of the metric
+ */
+class CustomMetric(var fEval: (NDArray, NDArray) => Float,
+                   override val name: String) extends EvalMetric(name) {
+  def update(labels: NDArray, preds: NDArray): Unit = {
+    require(labels.size == preds.size, "labels and predictions should have the same length.")
+    this.sumMetric += fEval(labels, preds)
+    this.numInst += 1
+  }
+}
diff --git a/scala-package/core/src/main/scala/ml/dmlc/mxnet/Executor.scala b/scala-package/core/src/main/scala/ml/dmlc/mxnet/Executor.scala
new file mode 100644
index 000000000000..4eb51f74cc5f
--- /dev/null
+++ b/scala-package/core/src/main/scala/ml/dmlc/mxnet/Executor.scala
@@ -0,0 +1,11 @@
+package ml.dmlc.mxnet
+
+/**
+ * Created by yuantang on 12/23/15.
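The metric classes above port MXNet's Python metrics to Scala; a NumPy sketch of the math each update is meant to accumulate (shapes are illustrative: scalar labels, one prediction row per instance):

import numpy as np

def accuracy(labels, preds):
    # fraction of instances whose argmax class matches the label
    return float(np.sum(np.argmax(preds, axis=1) == labels)) / len(labels)

def mae(labels, preds):
    # mean absolute error over a batch of scalar predictions
    return float(np.abs(labels - preds).mean())

def rmse(labels, preds):
    # root mean squared error over the same batch
    return float(np.sqrt(((labels - preds) ** 2).mean()))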
+ */ +abstract class Executor(var argArrays: Array[NDArray]) { + def forward + def backward + def setMonitorCallback(callback: (String, NDArray) => Any) + +} diff --git a/scala-package/core/src/main/scala/ml/dmlc/mxnet/LRScheduler.scala b/scala-package/core/src/main/scala/ml/dmlc/mxnet/LRScheduler.scala index 35e05786fe74..756194b94566 100644 --- a/scala-package/core/src/main/scala/ml/dmlc/mxnet/LRScheduler.scala +++ b/scala-package/core/src/main/scala/ml/dmlc/mxnet/LRScheduler.scala @@ -51,4 +51,3 @@ class FactorScheduler(protected var step: Int, protected var factor: Float) exte this.baseLR } } - diff --git a/scala-package/core/src/main/scala/ml/dmlc/mxnet/LibInfo.scala b/scala-package/core/src/main/scala/ml/dmlc/mxnet/LibInfo.scala index 5f2aee984757..96fe46a82815 100644 --- a/scala-package/core/src/main/scala/ml/dmlc/mxnet/LibInfo.scala +++ b/scala-package/core/src/main/scala/ml/dmlc/mxnet/LibInfo.scala @@ -41,12 +41,15 @@ class LibInfo { ndim: MXUintRef, data: ArrayBuffer[Int]): Int @native def mxNDArraySyncCopyToCPU(handle: NDArrayHandle, - data: Array[Float], + data: Array[MXFloat], size: Int): Int @native def mxNDArraySlice(handle: NDArrayHandle, start: MXUint, end: MXUint, sliceHandle: NDArrayHandle): Int + @native def mxNDArraySyncCopyFromCPU(handle: NDArrayHandle, + source: Array[MXFloat], + size: Int): Int @native def mxKVStoreCreate(name: String, handle: KVStoreHandle): Int @native def mxKVStoreInit(handle: KVStoreHandle, len: MXUint, diff --git a/scala-package/core/src/main/scala/ml/dmlc/mxnet/Monitor.scala b/scala-package/core/src/main/scala/ml/dmlc/mxnet/Monitor.scala new file mode 100644 index 000000000000..1c9c1edbabc1 --- /dev/null +++ b/scala-package/core/src/main/scala/ml/dmlc/mxnet/Monitor.scala @@ -0,0 +1,114 @@ +package ml.dmlc.mxnet + +import org.slf4j.LoggerFactory +import scala.collection.mutable + +/** + * Monitor outputs, weights, and gradients for debugging. + * + * @author Yuan Tang + * + * @param interval Number of batches between printing. + * @param statFunc A function that computes statistics of tensors. + * Takes a NDArray and returns a NDArray. defaults + * to mean absolute value |x|/size(x). + */ +class Monitor(protected val interval: Int, protected var statFunc: (NDArray) => NDArray = null) { + + private val logger = LoggerFactory.getLogger(classOf[Monitor]) + + if (statFunc == null) { + // TODO: more details here + statFunc = (x: NDArray) => x + } + + private var activated: Boolean = false + private var queue = new mutable.Queue[(Int, String, NDArray)] + private var step: Int = 0 + private var exes = new mutable.Queue[Executor] + + protected val statHelper = (name: String, arr: NDArray) => { + if (activated) { + // TODO: more details here + queue ++= List((step, name, statFunc(arr))) + } + } + + + /** + * Install callback to executor. + * Supports installing to multiple exes + * @param exe the Executor (returned by symbol.bind) to install to. + */ + def install(exe: Executor) = { + exe.setMonitorCallback(statHelper) + exes ++= List(exe) + } + + + /** + * Start collecting stats for current batch. + * Call before forward + */ + def tic = { + if (step % interval == 0) { + exes.foreach { exe => + exe.argArrays.foreach {arr => arr.waitToRead()} + } + queue = new mutable.Queue[(Int, String, NDArray)] + activated = true + } + step += 1 + } + + + /** + * End collecting for current batch and return results. + * Call after computation of current batch. 
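The tic/toc protocol mirrors the Python Monitor: tic before the forward pass arms collection every `interval` batches, the installed callback records statistics, and toc drains them. A rough pure-Python sketch of that control flow (not the mxnet API; names are illustrative):

class TinyMonitor(object):
    def __init__(self, interval, stat_func=abs):
        self.interval, self.stat_func = interval, stat_func
        self.step, self.activated, self.queue = 0, False, []

    def tic(self):
        # arm collection every `interval` batches
        if self.step % self.interval == 0:
            self.queue, self.activated = [], True
        self.step += 1

    def stat_helper(self, name, value):
        # callback installed into an executor
        if self.activated:
            self.queue.append((self.step, name, self.stat_func(value)))

    def toc(self):
        # disarm and hand back everything collected for this batch
        self.activated = False
        res, self.queue = self.queue, []
        return res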
+ */
+  def toc: mutable.Queue[(Int, String, String)] = {
+    val res = new mutable.Queue[(Int, String, String)]
+    if (activated) {
+      exes.foreach { exe =>
+        exe.argArrays.foreach { arr => arr.waitToRead() }
+      }
+      // TODO: also collect argument stats once Symbol is implemented, i.e.
+      // for ((name, array) <- exe.symbol.listArguments() zip exe.argArrays)
+      //   queue += ((step, name, statFunc(array)))
+      activated = false
+      queue.foreach { q =>
+        val (n, k, v) = q
+        if (v.shape.sameElements(Array(1))) {
+          res ++= List((n, k, v.toScalar.toString))
+        } else {
+          // mkString renders the values; Array.toString would only print a reference
+          res ++= List((n, k, v.toArray.mkString(",")))
+        }
+      }
+      queue = new mutable.Queue[(Int, String, NDArray)]
+    }
+    res
+  }
+
+  /**
+   * End collecting and print results
+   */
+  def tocPrint = {
+    val res = toc
+    res.foreach { re =>
+      val (n, k, v) = re
+      logger.info(s"Batch: ${n} ${k} ${v}")
+    }
+  }
+}
diff --git a/scala-package/core/src/main/scala/ml/dmlc/mxnet/NDArray.scala b/scala-package/core/src/main/scala/ml/dmlc/mxnet/NDArray.scala
index 9b8a55b99c77..91a8e526634e
--- a/scala-package/core/src/main/scala/ml/dmlc/mxnet/NDArray.scala
+++ b/scala-package/core/src/main/scala/ml/dmlc/mxnet/NDArray.scala
@@ -201,6 +201,10 @@ object NDArray {
     new NDArray(handle = NDArray._newAllocHandle(shape, context, delayAlloc = false))
   }
 
+  def empty(shape: Int *): NDArray = empty(shape.toArray)
+
+  def empty(ctx: Context, shape: Int *): NDArray = empty(shape.toArray, ctx)
+
   /**
    * Create a new NDArray filled with 0, with specified shape.
    *
@@ -211,10 +215,14 @@
    * @param shape shape of the NDArray.
    * @param ctx The context of the NDArray, default to current default context.
    * @return The created NDArray.
    */
   def zeros(shape: Array[Int], ctx: Context=null): NDArray = {
     val arr = empty(shape, ctx)
-    arr(0).set(0f)
+    arr.set(0f)
     arr
   }
 
+  def zeros(shape: Int *): NDArray = zeros(shape.toArray)
+
+  def zeros(ctx: Context, shape: Int *): NDArray = zeros(shape.toArray, ctx)
+
   /**
    * Create a new NDArray filled with 1, with specified shape.
    * @param shape shape of the NDArray.
@@ -223,10 +231,34 @@
    * @param ctx The context of the NDArray, default to current default context.
    * @return The created NDArray.
    */
   def ones(shape: Array[Int], ctx: Context=null): NDArray = {
     val arr = empty(shape, ctx)
-    arr(0).set(1f)
+    arr.set(1f)
     arr
   }
 
+  def ones(shape: Int *): NDArray = ones(shape.toArray)
+
+  def ones(ctx: Context, shape: Int *): NDArray = ones(shape.toArray, ctx)
+
+  /**
+   * Clip ndarray elements to range (from, to)
+   * @param array ndarray to be clipped
+   * @param min array min elements
+   * @param max array max elements
+   * @return a new clipped [[NDArray]]
+   */
+  def clip(array: NDArray, min: Float, max: Float): NDArray = {
+    NDArray._genericNDArrayFunction("clip", Array(array, min, max))(0)
+  }
+
+  /**
+   * Take sqrt of the src
+   * @param src Source input to the function
+   * @return new [[NDArray]]
+   */
+  def sqrt(src: NDArray): NDArray = {
+    NDArray._unaryNDArrayFunction("sqrt", src)
+  }
+
   /**
    * Create a new NDArray that copies content from source_array.
    * @param sourceArr Source data to create NDArray from.
@@ -285,7 +317,10 @@ class NDArray(val handle: NDArrayHandle, val writable: Boolean = true) {
    * Perform a synchronized copy from the array.
    * @param source The data source we should like to copy from.
    */
-  def _syncCopyfrom(source: Array[Float]): Unit = ???
+  private def syncCopyfrom(source: Array[Float]): Unit = {
+    require(source.length == size, "array size does not match the size of NDArray")
+    checkCall(_LIB.mxNDArraySyncCopyFromCPU(handle, source, source.length))
+  }
 
   /**
    * Return a sliced NDArray that shares memory with current one.
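The clip and sqrt helpers wrap the same operators the generated frontends expose (the R exports for mx.nd.clip and mx.nd.sqrt appear earlier in this change). For reference, a sketch of the equivalent calls from Python, assuming the generated positional signatures:

import mxnet as mx

a = mx.nd.array([1, 2, 3, 4, 5, 6])
print(mx.nd.clip(a, 2, 5).asnumpy())                    # [2. 2. 3. 4. 5. 5.]
print(mx.nd.sqrt(mx.nd.array([0, 1, 4, 9])).asnumpy())  # [0. 1. 2. 3.]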
@@ -296,14 +331,14 @@ class NDArray(val handle: NDArrayHandle, val writable: Boolean = true) { * * @return a sliced NDArray that shares memory with current one. */ - private def _slice(start: Int, stop: Int): NDArray = { + def slice(start: Int, stop: Int): NDArray = { val sliceHandle = new NDArrayHandle() checkCall(_LIB.mxNDArraySlice(handle, start, stop, sliceHandle)) new NDArray(handle = sliceHandle, writable = this.writable) } - private def _slice(start: Int): NDArray = { - _slice(start, shape(0)) + def slice(start: Int): NDArray = { + slice(start, shape(0)) } /** @@ -314,9 +349,6 @@ class NDArray(val handle: NDArrayHandle, val writable: Boolean = true) { */ def waitToRead(): Unit = ??? - def apply(sliceStart: Int): NDArray = _slice(sliceStart) - def apply(sliceStart: Int, sliceEnd: Int): NDArray = _slice(sliceStart, sliceEnd) - /** * Get context of current NDArray. * @return The context of current NDArray. @@ -334,10 +366,17 @@ class NDArray(val handle: NDArrayHandle, val writable: Boolean = true) { this } - def set(other: NDArray) = { + def set(other: NDArray): NDArray = { + require(writable, "trying to assign to a readonly NDArray") other.copyTo(this) } + def set(other: Array[Float]): NDArray = { + require(writable, "trying to assign to a readonly NDArray") + syncCopyfrom(other) + this + } + def +(other: NDArray): NDArray = { NDArray._binaryNDArrayFunction("_plus", this, other) } diff --git a/scala-package/core/src/main/scala/ml/dmlc/mxnet/Optimizer.scala b/scala-package/core/src/main/scala/ml/dmlc/mxnet/Optimizer.scala index 0d0cd38d6638..f9a58f5ca4db 100644 --- a/scala-package/core/src/main/scala/ml/dmlc/mxnet/Optimizer.scala +++ b/scala-package/core/src/main/scala/ml/dmlc/mxnet/Optimizer.scala @@ -1,9 +1,11 @@ package ml.dmlc.mxnet +import scala.collection.mutable + object Optimizer { def getUpdater(optimizer: Optimizer): MXKVStoreUpdater = { new MXKVStoreUpdater { - private val states = new scala.collection.mutable.HashMap[Int, AnyRef] + val states = new scala.collection.mutable.HashMap[Int, AnyRef] override def update(index: Int, grad: NDArray, weight: NDArray, handle: AnyRef): Unit = { val state = states.getOrElseUpdate(index, optimizer.createState(index, weight)) optimizer.update(index, weight, grad, state) @@ -12,7 +14,11 @@ object Optimizer { } } -abstract class Optimizer extends Serializable { +abstract class Optimizer(protected var rescaleGrad: Float = 1f) extends Serializable { + protected var lrScale: mutable.Map[Int, Float] = mutable.HashMap.empty[Int, Float] + protected var numUpdate: Int = 0 + protected val indexUpdateCount: mutable.Map[Int, Int] = mutable.HashMap.empty[Int, Int] + /** * Update the parameters. * @param index An unique integer key used to index the parameters @@ -21,10 +27,27 @@ abstract class Optimizer extends Serializable { * @param state NDArray or other objects returned by initState * The auxiliary state used in optimization. */ - def update(index: Int, weight: NDArray, grad: NDArray, state: AnyRef): Unit = ??? + // TODO: make state a ClassTag + def update(index: Int, weight: NDArray, grad: NDArray, state: AnyRef): Unit // Create additional optimizer state such as momentum. 
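getUpdater wraps an Optimizer in a KVStore callback that lazily creates per-key state on first use. The same pattern in a few lines of Python (a sketch of the idea, not the library code):

def get_updater(optimizer):
    states = {}
    def update(index, grad, weight):
        # create the optimizer state (e.g. a momentum buffer) on first touch of this key
        if index not in states:
            states[index] = optimizer.create_state(index, weight)
        optimizer.update(index, weight, grad, states[index])
    return update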
+  // TODO: make returned state a ClassTag
+  def createState(index: Int, weight: NDArray): AnyRef
+
+  // Set individual learning rate scale for parameters
+  def setLrScale(lrScale: Map[Int, Float]) {
+    this.lrScale = mutable.Map(lrScale.toSeq: _*)
+  }
+
+  /**
+   * Update numUpdate.
+   * @param index The index to be updated
+   */
+  protected def updateCount(index: Int): Unit = {
+    val count = indexUpdateCount.getOrElseUpdate(index, 0) + 1
+    indexUpdateCount.update(index, count)
+    numUpdate = Math.max(count, numUpdate)
+  }
 }
 
 trait MXKVStoreUpdater {
diff --git a/scala-package/core/src/main/scala/ml/dmlc/mxnet/optimizer/Adam.scala b/scala-package/core/src/main/scala/ml/dmlc/mxnet/optimizer/Adam.scala
new file mode 100644
index 000000000000..5616506e78e6
--- /dev/null
+++ b/scala-package/core/src/main/scala/ml/dmlc/mxnet/optimizer/Adam.scala
@@ -0,0 +1,87 @@
+package ml.dmlc.mxnet.optimizer
+
+import ml.dmlc.mxnet.{NDArray, Optimizer, LRScheduler}
+import ml.dmlc.mxnet.NDArrayConversions._
+
+/**
+ * Adam optimizer as described in [King2014]
+ *
+ * [King2014] Diederik Kingma, Jimmy Ba,
+ * Adam: A Method for Stochastic Optimization,
+ * http://arxiv.org/abs/1412.6980
+ *
+ * @author Yuan Tang
+ *
+ * @param learningRate Float, Step size.
+ * @param beta1 Float, Exponential decay rate for the first moment estimates.
+ * @param beta2 Float, Exponential decay rate for the second moment estimates.
+ * @param epsilon Float
+ * @param decayFactor Float
+ * @param wd Float, L2 regularization coefficient added to all the weights
+ * @param rescaleGrad Float, rescaling factor of gradient.
+ * @param clipGradient Float, clip gradient in range [-clip_gradient, clip_gradient]
+ * @param lrScheduler The learning rate scheduler
+ */
+class Adam(var learningRate: Float = 0.002f, val beta1: Float = 0.9f, val beta2: Float = 0.999f,
+           val epsilon: Float = 0.00000001f, val decayFactor: Float = 1 - 0.00000001f,
+           val wd: Float = 0.0f, rescaleGrad: Float = 1f, val clipGradient: Float = 0f,
+           val lrScheduler: LRScheduler = null) extends Optimizer(rescaleGrad) {
+
+  protected var time: Int = 0
+  protected var timeFirstIndex: Int = 0
+
+  /**
+   * Update the parameters.
+   * @param index A unique integer key used to index the parameters
+   * @param weight weight ndarray
+   * @param grad grad ndarray
+   * @param state NDArray or other objects returned by createState
+   *              The auxiliary state used in optimization.
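Adam's effective step size folds in the bias-correction factor sqrt(1 - beta2^t) / (1 - beta1^t). A quick NumPy check of how that factor evolves with the time step (illustrative values only):

import numpy as np

lr, beta1, beta2 = 0.002, 0.9, 0.999
for t in [1, 10, 100, 1000]:
    factor = np.sqrt(1.0 - beta2 ** t) / (1.0 - beta1 ** t)
    print(t, lr * factor)  # dips in early steps, then climbs back toward lr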
+   */
+  override def update(index: Int, weight: NDArray, grad: NDArray, state: AnyRef): Unit = {
+    val lr =
+      (if (lrScheduler != null) {
+        val scheduledLr = lrScheduler(numUpdate)
+        updateCount(index)
+        scheduledLr
+      } else {
+        this.learningRate
+      }) * lrScale.getOrElse(index, 1f)
+
+    val (mean, variance) = state.asInstanceOf[(NDArray, NDArray)]
+
+    // use the update count of the first indexed weight as the shared time step
+    if (timeFirstIndex == 0) {
+      timeFirstIndex = index
+      time = 0
+    } else if (timeFirstIndex == index) {
+      time += 1
+    }
+
+    val t1: Int = time + 1
+    learningRate = (lr * math.sqrt(1.0 - math.pow(beta2, t1)) / (1.0 - math.pow(beta1, t1))).toFloat
+    val beta1t = (beta1 * math.pow(decayFactor, t1 - 1)).toFloat
+
+    var resdGrad = grad * rescaleGrad
+    if (clipGradient != 0f) {
+      resdGrad = NDArray.clip(resdGrad, -clipGradient, clipGradient)
+    }
+
+    // the moment estimates stay NDArrays; collapsing them to scalars would break the update
+    val meanT = beta1t * mean + (1f - beta1t) * resdGrad
+    val varianceT = beta2 * variance + (1f - beta2) * resdGrad * resdGrad
+
+    var step = learningRate * meanT / (NDArray.sqrt(varianceT) + epsilon)
+    if (wd > 0.0f) {
+      step += lr * wd * weight
+    }
+
+    weight += -1f * step
+    // write the new moments back into the optimizer state
+    mean.set(meanT)
+    variance.set(varianceT)
+  }
+
+  // Create additional optimizer state: mean, variance
+  override def createState(index: Int, weight: NDArray): AnyRef = {
+    timeFirstIndex = 0
+    (NDArray.zeros(weight.shape, weight.context), // mean
+      NDArray.zeros(weight.shape, weight.context)) // variance
+  }
+}
diff --git a/scala-package/core/src/main/scala/ml/dmlc/mxnet/optimizer/SGD.scala b/scala-package/core/src/main/scala/ml/dmlc/mxnet/optimizer/SGD.scala
new file mode 100644
index 000000000000..bd434dab8ae2
--- /dev/null
+++ b/scala-package/core/src/main/scala/ml/dmlc/mxnet/optimizer/SGD.scala
@@ -0,0 +1,55 @@
+package ml.dmlc.mxnet.optimizer
+
+import ml.dmlc.mxnet.{Optimizer, LRScheduler, NDArray}
+import ml.dmlc.mxnet.NDArrayConversions._
+
+/**
+ * A very simple SGD optimizer with momentum and weight regularization.
+ * @author Yizhi Liu
+ */
+class SGD(val learningRate: Float = 0.01f, val momentum: Float = 0.0f,
+          val wd: Float = 0.0001f, rescaleGrad: Float = 1f, val clipGradient: Float = 0f,
+          val lrScheduler: LRScheduler = null) extends Optimizer(rescaleGrad) {
+  /**
+   * Update the parameters.
+   * @param index A unique integer key used to index the parameters
+   * @param weight weight ndarray
+   * @param grad grad ndarray
+   * @param state NDArray or other objects returned by createState
+   *              The auxiliary state used in optimization.
+   */
+  override def update(index: Int, weight: NDArray, grad: NDArray, state: AnyRef): Unit = {
+    // TODO(bing) implement wd_bias, wd_gamma, wd_beta (copy from python package)
+    val lr =
+      (if (lrScheduler != null) {
+        val scheduledLr = lrScheduler(numUpdate)
+        updateCount(index)
+        scheduledLr
+      } else {
+        this.learningRate
+      }) * lrScale.getOrElse(index, 1f)
+
+    var resdGrad = grad * rescaleGrad
+    if (clipGradient != 0f) {
+      resdGrad = NDArray.clip(resdGrad, -clipGradient, clipGradient)
+    }
+    if (state != null) {
+      val mom = state.asInstanceOf[NDArray]
+      mom *= momentum
+      // apply the rescaled (and possibly clipped) gradient, not the raw one
+      mom += -lr * (resdGrad + wd * weight)
+      weight += mom
+    } else {
+      require(momentum == 0f)
+      weight += -lr * (resdGrad + wd * weight)
+    }
+  }
+
+  // Create additional optimizer state such as momentum.
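The SGD update above is the classic momentum rule, fed by the rescaled (and optionally clipped) gradient plus L2 weight decay. In NumPy terms, a sketch of the math rather than the library code:

import numpy as np

def sgd_step(weight, grad, mom, lr=0.01, momentum=0.9, wd=0.0001,
             rescale=1.0, clip=0.0):
    g = grad * rescale
    if clip > 0.0:
        g = np.clip(g, -clip, clip)
    mom *= momentum
    mom += -lr * (g + wd * weight)  # momentum buffer accumulates the scaled step
    weight += mom
    return weight, mom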
+ override def createState(index: Int, weight: NDArray): AnyRef = { + if (momentum == 0.0f) { + null + } else { + NDArray.zeros(weight.shape, weight.context) + } + } +} diff --git a/scala-package/core/src/test/scala/ml/dmlc/mxnet/NDArraySuite.scala b/scala-package/core/src/test/scala/ml/dmlc/mxnet/NDArraySuite.scala index 4dfa62e19621..dba751df7916 100644 --- a/scala-package/core/src/test/scala/ml/dmlc/mxnet/NDArraySuite.scala +++ b/scala-package/core/src/test/scala/ml/dmlc/mxnet/NDArraySuite.scala @@ -5,29 +5,41 @@ import ml.dmlc.mxnet.NDArrayConversions._ class NDArraySuite extends FunSuite with BeforeAndAfterAll { test("to java array") { - val ndarray = NDArray.zeros(Array(2, 2)) + val ndarray = NDArray.zeros(2, 2) assert(ndarray.toArray === Array(0f, 0f, 0f, 0f)) } test("to scalar") { - val ndzeros = NDArray.zeros(Array(1)) + val ndzeros = NDArray.zeros(1) assert(ndzeros.toScalar === 0f) - val ndones = NDArray.ones(Array(1)) + val ndones = NDArray.ones(1) assert(ndones.toScalar === 1f) } test ("call toScalar on an ndarray which is not a scalar") { - intercept[Exception] { NDArray.zeros(Array(1,1)).toScalar } + intercept[Exception] { NDArray.zeros(1, 1).toScalar } } test("size and shape") { - val ndzeros = NDArray.zeros(Array(4, 1)) + val ndzeros = NDArray.zeros(4, 1) assert(ndzeros.shape === Array(4, 1)) assert(ndzeros.size === 4) } + test("set scalar value") { + val ndarray = NDArray.empty(2, 1) + ndarray.set(10f) + assert(ndarray.toArray === Array(10f, 10f)) + } + + test("copy from java array") { + val ndarray = NDArray.empty(4, 1) + ndarray.set(Array(1f, 2f, 3f, 4f)) + assert(ndarray.toArray === Array(1f, 2f, 3f, 4f)) + } + test("plus") { - val ndzeros = NDArray.zeros(Array(2, 1)) + val ndzeros = NDArray.zeros(2, 1) val ndones = ndzeros + 1f assert(ndones.toArray === Array(1f, 1f)) assert((ndones + ndzeros).toArray === Array(1f, 1f)) @@ -38,7 +50,7 @@ class NDArraySuite extends FunSuite with BeforeAndAfterAll { } test("minus") { - val ndones = NDArray.ones(Array(2, 1)) + val ndones = NDArray.ones(2, 1) val ndzeros = ndones - 1f assert(ndzeros.toArray === Array(0f, 0f)) assert((ndones - ndzeros).toArray === Array(1f, 1f)) @@ -50,7 +62,7 @@ class NDArraySuite extends FunSuite with BeforeAndAfterAll { } test("multiplication") { - val ndones = NDArray.ones(Array(2, 1)) + val ndones = NDArray.ones(2, 1) val ndtwos = ndones * 2 assert(ndtwos.toArray === Array(2f, 2f)) assert((ndones * ndones).toArray === Array(1f, 1f)) @@ -61,7 +73,7 @@ class NDArraySuite extends FunSuite with BeforeAndAfterAll { } test("division") { - val ndones = NDArray.ones(Array(2, 1)) + val ndones = NDArray.ones(2, 1) val ndzeros = ndones - 1f val ndhalves = ndones / 2 assert(ndhalves.toArray === Array(0.5f, 0.5f)) @@ -73,4 +85,15 @@ class NDArraySuite extends FunSuite with BeforeAndAfterAll { assert(ndhalves.toArray === Array(1f, 1f)) } + test("clip") { + val ndarray = NDArray.empty(3, 2) + ndarray.set(Array(1f, 2f, 3f, 4f, 5f, 6f)) + assert(NDArray.clip(ndarray, 2f, 5f).toArray === Array(2f, 2f, 3f, 4f, 5f, 5f)) + } + + test("sqrt") { + val ndarray = NDArray.empty(4, 1) + ndarray.set(Array(0f, 1f, 4f, 9f)) + assert(NDArray.sqrt(ndarray).toArray === Array(0f, 1f, 2f, 3f)) + } } diff --git a/scala-package/native/osx-x86_64/pom.xml b/scala-package/native/osx-x86_64/pom.xml index 9a4b8c45defc..b3719dcb7b38 100644 --- a/scala-package/native/osx-x86_64/pom.xml +++ b/scala-package/native/osx-x86_64/pom.xml @@ -13,6 +13,10 @@ MXNet Scala Package - Native OSX-x86_64 http://maven.apache.org + + opencv.pkg.txt + + jnilib @@ 
-66,9 +70,15 @@ - + + + + + + + @@ -93,6 +103,14 @@ + + -std=c++0x + + + + -msse3 -funroll-loops -Wno-unused-parameter -Wno-unknown-pragmas + + -I../../../include -I../../../dmlc-core/include @@ -101,15 +119,19 @@ -DMSHADOW_USE_CBLAS=${use.cblas} -DMSHADOW_USE_MKL=${use.mkl} -fPIC - -std=c++11 + + -shared + - -dynamiclib - ../../../lib/libmxnet.a + -fopenmp + -lm + -framework Accelerate ${ldflags.opencv} -framework JavaVM ${ldflags.blas} - -Wl,-dead_strip -Wl,-exported_symbol,_Java_* -Wl,-x + ../../../dmlc-core/libdmlc.a + -force_load ../../../lib/libmxnet.a diff --git a/scala-package/native/src/main/native/ml_dmlc_mxnet_native_c_api.cc b/scala-package/native/src/main/native/ml_dmlc_mxnet_native_c_api.cc index 2f1c95e0a6d0..a78a4b126659 100644 --- a/scala-package/native/src/main/native/ml_dmlc_mxnet_native_c_api.cc +++ b/scala-package/native/src/main/native/ml_dmlc_mxnet_native_c_api.cc @@ -204,6 +204,15 @@ JNIEXPORT jint JNICALL Java_ml_dmlc_mxnet_LibInfo_mxNDArraySlice(JNIEnv *env, jo return ret; } +JNIEXPORT jint JNICALL Java_ml_dmlc_mxnet_LibInfo_mxNDArraySyncCopyFromCPU + (JNIEnv *env, jobject obj, jobject ndArrayHandle, jfloatArray sourceArr, jint arrSize) { + jlong arrayPtr = getLongField(env, ndArrayHandle); + jfloat *sourcePtr = env->GetFloatArrayElements(sourceArr, NULL); + int ret = MXNDArraySyncCopyFromCPU((NDArrayHandle)arrayPtr, (const mx_float *)sourcePtr, arrSize); + env->ReleaseFloatArrayElements(sourceArr, sourcePtr, 0); + return ret; +} + // The related c api MXKVStoreSetUpdater function takes a c function pointer as its parameter, // while we write java functions here in scala-package. // Thus we have to wrap the function in a java object, and run env->CallVoidMethod(obj) once updater is invoked, diff --git a/scala-package/pom.xml b/scala-package/pom.xml index 85206d11409b..25c6b9cdcf1b 100644 --- a/scala-package/pom.xml +++ b/scala-package/pom.xml @@ -131,6 +131,29 @@ + + org.scalastyle + scalastyle-maven-plugin + 0.8.0 + + false + true + true + false + ${basedir}/src/main/scala + ${basedir}/src/test/scala + ${basedir}/src/main/resources/scalastyle_config.xml + ${project.basedir}/scalastyle_output.xml + UTF-8 + + + + + check + + + + net.alchim31.maven scala-maven-plugin diff --git a/src/c_api/c_predict_api.cc b/src/c_api/c_predict_api.cc index 3ce1e5be1690..47a069cca816 100644 --- a/src/c_api/c_predict_api.cc +++ b/src/c_api/c_predict_api.cc @@ -9,7 +9,8 @@ #include #include #include - +#include +#include #include "./c_api_error.h" using namespace mxnet; @@ -36,14 +37,40 @@ struct MXAPINDList { }; int MXPredCreate(const char* symbol_json_str, - const char* param_bytes, - size_t param_size, + const void* param_bytes, + int param_size, int dev_type, int dev_id, mx_uint num_input_nodes, const char** input_keys, const mx_uint* input_shape_indptr, const mx_uint* input_shape_data, - PredictorHandle* out) { + PredictorHandle* out) { + return MXPredCreatePartialOut( + symbol_json_str, + param_bytes, + param_size, + dev_type, + dev_id, + num_input_nodes, + input_keys, + input_shape_indptr, + input_shape_data, + 0, + NULL, + out); +} + +int MXPredCreatePartialOut(const char* symbol_json_str, + const void* param_bytes, + int param_size, + int dev_type, int dev_id, + mx_uint num_input_nodes, + const char** input_keys, + const mx_uint* input_shape_indptr, + const mx_uint* input_shape_data, + mx_uint num_output_nodes, + const char** output_keys, + PredictorHandle* out) { MXAPIPredictor* ret = new MXAPIPredictor(); API_BEGIN(); Symbol sym; @@ -54,9 +81,37 @@ int MXPredCreate(const 
char* symbol_json_str,
     dmlc::JSONReader reader(&is);
     sym.Load(&reader);
   }
+  // looks likely to output the internal results
+  if (num_output_nodes != 0) {
+    Symbol internal = sym.GetInternals();
+    std::vector<std::string> all_out = internal.ListOutputs();
+    std::vector<Symbol> out_syms(num_output_nodes);
+    for (mx_uint i = 0; i < num_output_nodes; ++i) {
+      std::string out_key(output_keys[i]);
+      out_key += "_output";
+      for (size_t j = 0; j < all_out.size(); ++j) {
+        if (all_out[j] == out_key) {
+          out_syms[i] = internal[j];
+          break;
+        }
+        CHECK_NE(j, all_out.size() - 1) << "didn't find node name: " << out_key;
+      }
+    }
+    sym = Symbol::CreateGroup(out_syms);
+  }
+
   // load the parameters
   std::unordered_map<std::string, NDArray> arg_params, aux_params;
   {
+    std::unordered_set<std::string> arg_names, aux_names;
+    std::vector<std::string> arg_names_vec = sym.ListArguments();
+    std::vector<std::string> aux_names_vec = sym.ListAuxiliaryStates();
+    for (size_t i = 0; i < arg_names_vec.size(); ++i) {
+      arg_names.insert(arg_names_vec[i]);
+    }
+    for (size_t i = 0; i < aux_names_vec.size(); ++i) {
+      aux_names.insert(aux_names_vec[i]);
+    }
     std::vector<NDArray> data;
     std::vector<std::string> names;
     dmlc::MemoryFixedSizeStream fi((void*)param_bytes, param_size);  // NOLINT(*)
@@ -65,10 +120,16 @@
         << "Invalid param file format";
     for (size_t i = 0; i < names.size(); ++i) {
       if (!strncmp(names[i].c_str(), "aux:", 4)) {
-        aux_params[std::string(names[i].c_str() + 4)] = data[i];
+        std::string name(names[i].c_str() + 4);
+        if (aux_names.count(name) != 0) {
+          aux_params[name] = data[i];
+        }
       }
       if (!strncmp(names[i].c_str(), "arg:", 4)) {
-        arg_params[std::string(names[i].c_str() + 4)] = data[i];
+        std::string name(names[i].c_str() + 4);
+        if (arg_names.count(name) != 0) {
+          arg_params[name] = data[i];
+        }
       }
     }
   }
@@ -192,7 +253,7 @@ int MXPredFree(PredictorHandle handle) {
 }
 
 int MXNDListCreate(const char* nd_file_bytes,
-                   size_t nd_file_size,
+                   int nd_file_size,
                    NDListHandle *out,
                    mx_uint* out_length) {
   MXAPINDList* ret = new MXAPINDList();
@@ -242,5 +303,3 @@ int MXNDListFree(NDListHandle handle) {
   delete static_cast<MXAPINDList*>(handle);
   API_END();
 }
-
-
diff --git a/src/io/image_augmenter.h b/src/io/image_augmenter.h
index 5a640bc97802..53d6e6092d29 100644
--- a/src/io/image_augmenter.h
+++ b/src/io/image_augmenter.h
@@ -132,7 +132,9 @@ class ImageAugmenter {
     // normal augmentation by affine transformation.
     if (param_.max_rotate_angle > 0 || param_.max_shear_ratio > 0.0f
-        || param_.rotate > 0 || rotate_list_.size() > 0) {
+        || param_.rotate > 0 || rotate_list_.size() > 0 || param_.max_random_scale != 1.0
+        || param_.min_random_scale != 1.0 || param_.max_aspect_ratio != 0.0f
+        || param_.max_img_size != 1e10f || param_.min_img_size != 0.0f) {
       std::uniform_real_distribution<float> rand_uniform(0, 1);
       // shear
       float s = rand_uniform(*prnd) * param_.max_shear_ratio * 2 - param_.max_shear_ratio;
diff --git a/src/io/iter_batchloader.h b/src/io/iter_batchloader.h
index 57db5f2d7846..dd3545d682f4 100644
--- a/src/io/iter_batchloader.h
+++ b/src/io/iter_batchloader.h
@@ -21,21 +21,12 @@ namespace io {
 struct BatchParam : public dmlc::Parameter<BatchParam> {
   /*! \brief label width */
   index_t batch_size;
-  /*! \brief input shape */
-  TShape data_shape;
-  /*! \brief label width */
-  index_t label_width;
   /*!
\brief use round roubin to handle overflow batch */ bool round_batch; // declare parameters DMLC_DECLARE_PARAMETER(BatchParam) { DMLC_DECLARE_FIELD(batch_size) .describe("Batch Param: Batch size."); - DMLC_DECLARE_FIELD(data_shape) - .set_expect_ndim(3).enforce_nonzero() - .describe("Dataset Param: Shape of each instance generated by the DataIter."); - DMLC_DECLARE_FIELD(label_width).set_default(1) - .describe("Dataset Param: Label width."); DMLC_DECLARE_FIELD(round_batch).set_default(true) .describe("Batch Param: Use round robin to handle overflow batch."); } @@ -45,40 +36,26 @@ struct BatchParam : public dmlc::Parameter { class BatchLoader : public IIterator { public: explicit BatchLoader(IIterator *base): - base_(base), head_(1), num_overflow_(0) {} + base_(base), head_(1), num_overflow_(0) { + } + virtual ~BatchLoader(void) { delete base_; - // Free space for TblobBatch - mshadow::FreeSpace(&data_holder_); - mshadow::FreeSpace(&label_holder_); } + inline void Init(const std::vector >& kwargs) { std::vector > kwargs_left; // init batch param, it could have similar param with kwargs_left = param_.InitAllowUnknown(kwargs); - // init object attributes - std::vector data_shape_vec; - data_shape_vec.push_back(param_.batch_size); - for (size_t shape_dim = 0; shape_dim < param_.data_shape.ndim(); ++shape_dim) { - data_shape_vec.push_back(param_.data_shape[shape_dim]); - } - data_shape_ = TShape(data_shape_vec.begin(), data_shape_vec.end()); - std::vector label_shape_vec; - label_shape_vec.push_back(param_.batch_size); - label_shape_vec.push_back(param_.label_width); - label_shape_ = TShape(label_shape_vec.begin(), label_shape_vec.end()); // Init space for out_ out_.inst_index = new unsigned[param_.batch_size]; out_.batch_size = param_.batch_size; out_.data.clear(); - data_holder_ = mshadow::NewTensor(data_shape_.get<4>(), 0.0f); - label_holder_ = mshadow::NewTensor(label_shape_.get<2>(), 0.0f); - out_.data.push_back(TBlob(data_holder_)); - out_.data.push_back(TBlob(label_holder_)); // init base iterator base_->Init(kwargs); } - inline void BeforeFirst(void) { + + virtual void BeforeFirst(void) { if (param_.round_batch == 0 || num_overflow_ == 0) { // otherise, we already called before first base_->BeforeFirst(); @@ -87,7 +64,7 @@ class BatchLoader : public IIterator { } head_ = 1; } - inline bool Next(void) { + virtual bool Next(void) { out_.num_batch_padd = 0; out_.batch_size = param_.batch_size; this->head_ = 0; @@ -99,11 +76,15 @@ class BatchLoader : public IIterator { while (base_->Next()) { const DataInst& d = base_->Value(); out_.inst_index[top] = d.index; - mshadow::Copy(out_.data[1].get()[top], - d.data[1].get()); - mshadow::Copy(out_.data[0].get()[top], - d.data[0].get()); - if (++ top >= param_.batch_size) { + if (data_.size() == 0) { + this->InitData(d); + } + for (size_t i = 0; i < d.data.size(); ++i) { + CHECK_EQ(unit_size_[i], d.data[i].Size()); + mshadow::Copy(data_[i].Slice(top * unit_size_[i], (top + 1) * unit_size_[i]), + d.data[i].get_with_shape(mshadow::Shape1(unit_size_[i]))); + } + if (++top >= param_.batch_size) { return true; } } @@ -115,10 +96,12 @@ class BatchLoader : public IIterator { CHECK(base_->Next()) << "number of input must be bigger than batch size"; const DataInst& d = base_->Value(); out_.inst_index[top] = d.index; - mshadow::Copy(out_.data[1].get()[top], - d.data[1].get()); - mshadow::Copy(out_.data[0].get()[top], - d.data[0].get()); + // copy data + for (size_t i = 0; i < d.data.size(); ++i) { + CHECK_EQ(unit_size_[i], d.data[i].Size()); + 
mshadow::Copy(data_[i].Slice(top * unit_size_[i], (top + 1) * unit_size_[i]), + d.data[i].get_with_shape(mshadow::Shape1(unit_size_[i]))); + } } out_.num_batch_padd = num_overflow_; } else { @@ -144,13 +127,31 @@ class BatchLoader : public IIterator { /*! \brief number of overflow instances that readed in round_batch mode */ int num_overflow_; /*! \brief data shape */ - TShape data_shape_; - /*! \brief label shape */ - TShape label_shape_; + std::vector shape_; + /*! \brief unit size */ + std::vector unit_size_; /*! \brief tensor to hold data */ - mshadow::Tensor data_holder_; - /*! \brief tensor to hold label */ - mshadow::Tensor label_holder_; + std::vector > data_; + // initialize the data holder by using from the first batch. + inline void InitData(const DataInst& first_batch) { + shape_.resize(first_batch.data.size()); + data_.resize(first_batch.data.size()); + unit_size_.resize(first_batch.data.size()); + for (size_t i = 0; i < first_batch.data.size(); ++i) { + TShape src_shape = first_batch.data[i].shape_; + // init object attributes + std::vector shape_vec; + shape_vec.push_back(param_.batch_size); + for (index_t dim = 0; dim < src_shape.ndim(); ++dim) { + shape_vec.push_back(src_shape[dim]); + } + TShape dst_shape(shape_vec.begin(), shape_vec.end()); + shape_[i] = dst_shape; + data_[i].Resize(mshadow::Shape1(dst_shape.Size())); + unit_size_[i] = src_shape.Size(); + out_.data.push_back(TBlob(data_[i].dptr_, dst_shape, cpu::kDevMask)); + } + } }; // class BatchLoader } // namespace io } // namespace mxnet diff --git a/src/io/iter_csv.cc b/src/io/iter_csv.cc new file mode 100644 index 000000000000..314f4c5d0847 --- /dev/null +++ b/src/io/iter_csv.cc @@ -0,0 +1,141 @@ +/*! + * Copyright (c) 2015 by Contributors + * \file iter_csv.cc + * \brief define a CSV Reader to read in arrays + */ +#include +#include +#include +#include +#include +#include "./iter_prefetcher.h" +#include "./iter_batchloader.h" + +namespace mxnet { +namespace io { +// CSV parameters +struct CSVIterParam : public dmlc::Parameter { + /*! \brief path to the csv file */ + std::string data_csv; + /*! \brief input shape */ + TShape data_shape; + /*! \brief path to label csv file */ + std::string label_csv; + /*! \brief path to label csv file */ + TShape label_shape; + // declare parameters + DMLC_DECLARE_PARAMETER(CSVIterParam) { + DMLC_DECLARE_FIELD(data_csv) + .describe("Dataset Param: Data csv path."); + DMLC_DECLARE_FIELD(data_shape) + .describe("Dataset Param: Shape of the data."); + DMLC_DECLARE_FIELD(label_csv).set_default("NULL") + .describe("Dataset Param: Label csv path. 
If is NULL, all labels will be returned as 0");
+    index_t shape1[] = {1};
+    DMLC_DECLARE_FIELD(label_shape).set_default(TShape(shape1, shape1 + 1))
+        .describe("Dataset Param: Shape of the label.");
+  }
+};
+
+class CSVIter: public IIterator<DataInst> {
+ public:
+  CSVIter() {
+    out_.data.resize(2);
+  }
+  virtual ~CSVIter() {}
+
+  // initialize iterator, loads data in
+  virtual void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) {
+    param_.InitAllowUnknown(kwargs);
+    data_parser_.reset(dmlc::Parser<uint32_t>::Create(param_.data_csv.c_str(), 0, 1, "csv"));
+    if (param_.label_csv != "NULL") {
+      label_parser_.reset(dmlc::Parser<uint32_t>::Create(param_.label_csv.c_str(), 0, 1, "csv"));
+    } else {
+      dummy_label.set_pad(false);
+      dummy_label.Resize(mshadow::Shape1(1));
+      dummy_label = 0.0f;
+    }
+  }
+
+  virtual void BeforeFirst() {
+    data_parser_->BeforeFirst();
+    if (label_parser_.get() != nullptr) {
+      label_parser_->BeforeFirst();
+    }
+    data_ptr_ = label_ptr_ = 0;
+    data_size_ = label_size_ = 0;
+    inst_counter_ = 0;
+    end_ = false;
+  }
+
+  virtual bool Next() {
+    if (end_) return false;
+    while (data_ptr_ >= data_size_) {
+      if (!data_parser_->Next()) {
+        end_ = true; return false;
+      }
+      data_ptr_ = 0;
+      data_size_ = data_parser_->Value().size;
+    }
+    out_.index = inst_counter_++;
+    CHECK_LT(data_ptr_, data_size_);
+    out_.data[0] = AsTBlob(data_parser_->Value()[data_ptr_++], param_.data_shape);
+
+    if (label_parser_.get() != nullptr) {
+      while (label_ptr_ >= label_size_) {
+        CHECK(label_parser_->Next())
+            << "Data CSV's row is smaller than the number of rows in label_csv";
+        label_ptr_ = 0;
+        label_size_ = label_parser_->Value().size;
+      }
+      CHECK_LT(label_ptr_, label_size_);
+      out_.data[1] = AsTBlob(label_parser_->Value()[label_ptr_++], param_.label_shape);
+    } else {
+      out_.data[1] = dummy_label;
+    }
+    return true;
+  }
+
+  virtual const DataInst &Value(void) const {
+    return out_;
+  }
+
+ private:
+  inline TBlob AsTBlob(const dmlc::Row<uint32_t>& row, const TShape& shape) {
+    CHECK_EQ(row.length, shape.Size())
+        << "The data size in CSV does not match the size of shape: "
+        << "specified shape=" << shape << ", the csv row-length=" << row.length;
+    const real_t* ptr = row.value;
+    return TBlob((real_t*)ptr, shape, cpu::kDevMask);  // NOLINT(*)
+  }
+
+  CSVIterParam param_;
+  // output instance
+  DataInst out_;
+  // internal instance counter
+  unsigned inst_counter_{0};
+  // at end
+  bool end_{false};
+  // dummy label
+  mshadow::TensorContainer<cpu, 1, real_t> dummy_label;
+  // label parser
+  size_t label_ptr_{0}, label_size_{0};
+  size_t data_ptr_{0}, data_size_{0};
+  std::unique_ptr<dmlc::Parser<uint32_t> > label_parser_;
+  std::unique_ptr<dmlc::Parser<uint32_t> > data_parser_;
+};
+
+DMLC_REGISTER_PARAMETER(CSVIterParam);
+
+MXNET_REGISTER_IO_ITER(CSVIter)
+.describe("Create iterator for dataset in csv.")
+.add_arguments(CSVIterParam::__FIELDS__())
+.set_body([]() {
+    return new PrefetcherIter(
+        new BatchLoader(
+            new CSVIter()));
+  });
+
+}  // namespace io
+}  // namespace mxnet
diff --git a/src/io/iter_image_recordio.cc b/src/io/iter_image_recordio.cc
index 1da7f71c0fd6..2db6d344c101 100644
--- a/src/io/iter_image_recordio.cc
+++ b/src/io/iter_image_recordio.cc
@@ -247,30 +247,31 @@ ParseNext(std::vector<InstVector> *out_vec) {
       cv::Mat buf(1, rec.content_size, CV_8U, rec.content);
       // -1 to keep the number of channels of the encoded image, and not force gray or color.
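Once registered, the iterator is reachable from every frontend (the matching R export mx.io.CSVIter appears near the top of this change). A hedged Python sketch; the file names are hypothetical and each row of data.csv is assumed to hold one 3-element instance:

import mxnet as mx

train_iter = mx.io.CSVIter(data_csv='./data.csv', data_shape=(3,),
                           label_csv='./label.csv', label_shape=(1,),
                           batch_size=32, round_batch=True)
for batch in train_iter:
    pass  # batch.data[0] has shape (32, 3); labels ride in batch.label[0]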
res = cv::imdecode(buf, -1); - int n_channels = res.channels(); + const int n_channels = res.channels(); res = augmenters_[tid]->Process(res, prnds_[tid]); out.Push(static_cast(rec.image_index()), mshadow::Shape3(n_channels, res.rows, res.cols), mshadow::Shape1(param_.label_width)); mshadow::Tensor data = out.data().Back(); - // Substract mean value on each channel. - if (n_channels == 3) { - for (int i = 0; i < res.rows; ++i) { - for (int j = 0; j < res.cols; ++j) { - cv::Vec3b bgr = res.at(i, j); - data[0][i][j] = bgr[2]; - data[1][i][j] = bgr[1]; - data[2][i][j] = bgr[0]; - } - } - } else { - for (int i = 0; i < res.rows; ++i) { - for (int j = 0; j < res.cols; ++j) { - data[0][i][j] = res.at(i, j); + + // For RGB or RGBA data, swap the B and R channel: + // OpenCV store as BGR (or BGRA) and we want RGB (or RGBA) + std::vector swap_indices; + if (n_channels == 1) swap_indices = {0}; + if (n_channels == 3) swap_indices = {2, 1, 0}; + if (n_channels == 4) swap_indices = {2, 1, 0, 3}; + + for (int i = 0; i < res.rows; ++i) { + uchar* im_data = res.ptr(i); + for (int j = 0; j < res.cols; ++j) { + for (int k = 0; k < n_channels; ++k) { + data[k][i][j] = im_data[swap_indices[k]]; } + im_data += n_channels; } } + mshadow::Tensor label = out.label().Back(); if (label_map_ != nullptr) { mshadow::Copy(label, label_map_->Find(rec.image_index())); diff --git a/src/io/iter_normalize.h b/src/io/iter_normalize.h index ad91ba691254..6435f53d8e0b 100644 --- a/src/io/iter_normalize.h +++ b/src/io/iter_normalize.h @@ -37,6 +37,8 @@ struct ImageNormalizeParam : public dmlc::Parameter { float mean_g; /*! \brief mean value for b channel */ float mean_b; + /*! \brief mean value for alpha channel */ + float mean_a; /*! \brief scale on color space */ float scale; /*! \brief maximum ratio of contrast variation */ @@ -58,9 +60,11 @@ struct ImageNormalizeParam : public dmlc::Parameter { DMLC_DECLARE_FIELD(mean_r).set_default(0.0f) .describe("Augmentation Param: Mean value on R channel."); DMLC_DECLARE_FIELD(mean_g).set_default(0.0f) - .describe("Augmentation: Mean value on G channel."); + .describe("Augmentation Param: Mean value on G channel."); DMLC_DECLARE_FIELD(mean_b).set_default(0.0f) - .describe("Augmentation: Mean value on B channel."); + .describe("Augmentation Param: Mean value on B channel."); + DMLC_DECLARE_FIELD(mean_a).set_default(0.0f) + .describe("Augmentation Param: Mean value on Alpha channel."); DMLC_DECLARE_FIELD(scale).set_default(1.0f) .describe("Augmentation Param: Scale in color space."); DMLC_DECLARE_FIELD(max_random_contrast).set_default(0.0f) @@ -86,7 +90,8 @@ class ImageNormalizeIter : public IIterator { param_.InitAllowUnknown(kwargs); base_->Init(kwargs); rnd_.seed(kRandMagic + param_.seed); - + outimg_.set_pad(false); + meanimg_.set_pad(false); if (param_.mean_img.length() != 0) { std::unique_ptr fi( dmlc::Stream::Create(param_.mean_img.c_str(), "r", true)); @@ -175,14 +180,16 @@ class ImageNormalizeIter : public IIterator { float illumination = rand_uniform(rnd_) * param_.max_random_illumination * 2 - param_.max_random_illumination; - if (param_.mean_r > 0.0f || param_.mean_g > 0.0f || param_.mean_b > 0.0f) { - // If the input has 3 channels, we substract the mean value on each - if (data.shape_[0] == 3) { - data[0] -= param_.mean_r; + if (param_.mean_r > 0.0f || param_.mean_g > 0.0f || + param_.mean_b > 0.0f || param_.mean_a > 0.0f) { + // substract mean per channel + data[0] -= param_.mean_r; + if (data.shape_[0] >= 3) { data[1] -= param_.mean_g; data[2] -= param_.mean_b; - } else 
{ - data[0] -= param_.mean_r; + } + if (data.shape_[0] == 4) { + data[3] -= param_.mean_a; } if ((param_.rand_mirror && coin_flip(rnd_)) || param_.mirror) { outimg_ = mirror(data * contrast + illumination) * param_.scale; diff --git a/src/operator/cudnn_batch_norm-inl.h b/src/operator/cudnn_batch_norm-inl.h new file mode 100644 index 000000000000..aedd0c9eddbc --- /dev/null +++ b/src/operator/cudnn_batch_norm-inl.h @@ -0,0 +1,124 @@ +/*! + * Copyright (c) 2015 by Contributors + * \file cudnn_batch_norm-inl.h + * \brief + * \author Junyuan Xie +*/ + +#ifndef MXNET_OPERATOR_CUDNN_BATCH_NORM_INL_H_ +#define MXNET_OPERATOR_CUDNN_BATCH_NORM_INL_H_ +#include +#include +#include +#include +#include "./batch_norm-inl.h" + +namespace mxnet { +namespace op { + +namespace cudnnbatchnorm { +enum CuDNNBatchNormOpInputs {kData, kGamma, kBeta}; +enum CuDNNBatchNormOpOutputs {kOut, kMean, kInvVar}; +enum CuDNNBatchNormOpAuxiliary {kMovingMean, kMovingInvVar}; +} // namespace cudnnbatchnorm + +struct CuDNNBatchNormParam : public dmlc::Parameter { + float eps; + float momentum; + DMLC_DECLARE_PARAMETER(CuDNNBatchNormParam) { + DMLC_DECLARE_FIELD(eps).set_default(1e-3f) + .describe("Epsilon to prevent div 0"); + DMLC_DECLARE_FIELD(momentum).set_default(0.9f) + .describe("Momentum for moving average"); + } +}; + +template +Operator *CreateOp(CuDNNBatchNormParam param); + + +#if DMLC_USE_CXX11 +class CuDNNBatchNormProp : public OperatorProperty { + public: + void Init(const std::vector >& kwargs) override { + param_.Init(kwargs); + } + + std::map GetParams() const override { + return param_.__DICT__(); + } + + bool InferShape(std::vector *in_shape, + std::vector *out_shape, + std::vector *aux_shape) const override { + using namespace mshadow; + CHECK_EQ(in_shape->size(), 3) << "Input:[data, gamma, beta]"; + const TShape &dshape = in_shape->at(0); + if (dshape.ndim() == 0) return false; + in_shape->at(1) = TShape(Shape1(dshape[1])); + in_shape->at(2) = TShape(Shape1(dshape[1])); + + out_shape->clear(); + out_shape->push_back(dshape); + out_shape->push_back(Shape1(dshape[1])); + out_shape->push_back(Shape1(dshape[1])); + + aux_shape->clear(); + aux_shape->push_back(Shape1(dshape[1])); + aux_shape->push_back(Shape1(dshape[1])); + return true; + } + + OperatorProperty* Copy() const override { + auto ptr = new CuDNNBatchNormProp(); + ptr->param_ = param_; + return ptr; + } + + std::string TypeString() const override { + return "CuDNNBatchNorm"; + } + + std::vector DeclareBackwardDependency( + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data) const override { + return {out_grad[cudnnbatchnorm::kOut], + out_data[cudnnbatchnorm::kMean], + out_data[cudnnbatchnorm::kInvVar], + in_data[cudnnbatchnorm::kData], + in_data[cudnnbatchnorm::kGamma], + in_data[cudnnbatchnorm::kBeta] + }; + } + + int NumVisibleOutputs() const override { + return 1; + } + + int NumOutputs() const override { + return 3; + } + + std::vector ListArguments() const override { + return {"data", "gamma", "beta"}; + } + + std::vector ListOutputs() const override { + return {"output", "mean", "inv_var"}; + } + + std::vector ListAuxiliaryStates() const override { + return {"moving_mean", "moving_inv_var"}; + } + + Operator* CreateOperator(Context ctx) const override; + + private: + CuDNNBatchNormParam param_; +}; // class BatchNormProp + +#endif // DMLC_USE_CXX11 +} // namespace op +} // namespace mxnet +#endif // MXNET_OPERATOR_CUDNN_BATCH_NORM_INL_H_ diff --git a/src/operator/cudnn_batch_norm.cc 
b/src/operator/cudnn_batch_norm.cc new file mode 100644 index 000000000000..a950e62f5102 --- /dev/null +++ b/src/operator/cudnn_batch_norm.cc @@ -0,0 +1,30 @@ +/*! + * Copyright (c) 2015 by Contributors + * \file cudnn_batch_norm.cc + * \brief + * \author Junyuan Xie +*/ + +#include "./cudnn_batch_norm-inl.h" +namespace mxnet { +namespace op { +#if CUDNN_MAJOR >= 4 +template<> +Operator *CreateOp(CuDNNBatchNormParam param) { + LOG(FATAL) << "CuDNNBatchNormOp is only available for gpu."; + return NULL; +} + +Operator *CuDNNBatchNormProp::CreateOperator(Context ctx) const { + DO_BIND_DISPATCH(CreateOp, param_); +} + +DMLC_REGISTER_PARAMETER(CuDNNBatchNormParam); + +MXNET_REGISTER_OP_PROPERTY(CuDNNBatchNorm, CuDNNBatchNormProp) +.describe("Apply batch normalization to input.") +.add_argument("data", "Symbol", "Input data to batch normalization") +.add_arguments(CuDNNBatchNormParam::__FIELDS__()); +#endif // CUDNN_MAJOR >= 4 +} // namespace op +} // namespace mxnet diff --git a/src/operator/cudnn_batch_norm.cu b/src/operator/cudnn_batch_norm.cu new file mode 100644 index 000000000000..f70c21ef327a --- /dev/null +++ b/src/operator/cudnn_batch_norm.cu @@ -0,0 +1,173 @@ +/*! + * Copyright (c) 2015 by Contributors + * \file cudnn_batch_norm.cu + * \brief + * \author Junyuan Xie +*/ + +#include "./cudnn_batch_norm-inl.h" + +namespace mxnet { +namespace op { +#if CUDNN_MAJOR >= 4 +class CuDNNBatchNormOp : public Operator { + public: + explicit CuDNNBatchNormOp(CuDNNBatchNormParam param) { + this->param_ = param; + init_cudnn_ = false; + dtype_ = CUDNN_DATA_FLOAT; + } + + virtual void Forward(const OpContext &ctx, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data, + const std::vector &aux_states) { + using namespace mshadow; + using namespace mshadow::expr; + CHECK_EQ(in_data.size(), 3); + CHECK_EQ(aux_states.size(), 2); + if (ctx.is_train) { + CHECK_EQ(out_data.size(), 3); + CHECK_EQ(req.size(), 3); + } else { + CHECK_GE(out_data.size(), 1); + CHECK_GE(req.size(), 1); + } + CHECK_EQ(req[cudnnbatchnorm::kOut], kWriteTo); + CHECK_GE(in_data[cudnnbatchnorm::kData].ndim(), 2); + CHECK_LE(in_data[cudnnbatchnorm::kData].ndim(), 4); + + if (!init_cudnn_) { + for (int i = 0; i < 4; ++i) { + if (i < in_data[cudnnbatchnorm::kData].ndim()) { + shape_[i] = in_data[cudnnbatchnorm::kData].shape_[i]; + } else { + shape_[i] = 1; + } + } + CHECK_EQ(cudnnCreateTensorDescriptor(&io_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnCreateTensorDescriptor(&mean_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnSetTensor4dDescriptor(io_desc_, + CUDNN_TENSOR_NCHW, + dtype_, + shape_[0], + shape_[1], + shape_[2], + shape_[3]), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnSetTensor4dDescriptor(mean_desc_, + CUDNN_TENSOR_NCHW, + dtype_, + 1, + shape_[1], + 1, + 1), CUDNN_STATUS_SUCCESS); + init_cudnn_ = true; + } + + Stream *s = ctx.get_stream(); + Tensor x = in_data[cudnnbatchnorm::kData].get_with_shape(shape_, s); + Tensor gamma = in_data[cudnnbatchnorm::kGamma].get_with_shape(Shape1(shape_[1]), s); + Tensor beta = in_data[cudnnbatchnorm::kBeta].get_with_shape(Shape1(shape_[1]), s); + Tensor y = out_data[cudnnbatchnorm::kOut].get_with_shape(shape_, s); + Tensor moving_mean = aux_states[cudnnbatchnorm::kMovingMean].get_with_shape(Shape1(shape_[1]), s); + Tensor moving_inv_var = aux_states[cudnnbatchnorm::kMovingInvVar].get_with_shape(Shape1(shape_[1]), s); + float a = 1.0f, b = 0.0f; + if (ctx.is_train) { + Tensor save_mean = out_data[cudnnbatchnorm::kMean].get_with_shape(Shape1(shape_[1]), s); + 
Tensor save_inv_var = out_data[cudnnbatchnorm::kInvVar].get_with_shape(Shape1(shape_[1]), s); + CHECK_EQ(cudnnBatchNormalizationForwardTraining(s->dnn_handle_, + CUDNN_BATCHNORM_SPATIAL, + &a, + &b, + io_desc_, + x.dptr_, + io_desc_, + y.dptr_, + mean_desc_, + gamma.dptr_, + beta.dptr_, + param_.momentum, + moving_mean.dptr_, + moving_inv_var.dptr_, + param_.eps, + save_mean.dptr_, + save_inv_var.dptr_), CUDNN_STATUS_SUCCESS); + } else { + CHECK_EQ(cudnnBatchNormalizationForwardInference(s->dnn_handle_, + CUDNN_BATCHNORM_SPATIAL, + &a, + &b, + io_desc_, + x.dptr_, + io_desc_, + y.dptr_, + mean_desc_, + gamma.dptr_, + beta.dptr_, + moving_mean.dptr_, + moving_inv_var.dptr_, + param_.eps), CUDNN_STATUS_SUCCESS); + } + } + + virtual void Backward(const OpContext &ctx, + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &req, + const std::vector &in_grad, + const std::vector &aux_states) { + using namespace mshadow; + using namespace mshadow::expr; + CHECK_EQ(out_grad.size(), 1); + CHECK_EQ(in_data.size(), 3); + CHECK_EQ(out_data.size(), 3); + CHECK_EQ(in_grad.size(), 3); + + Stream *s = ctx.get_stream(); + Tensor x = in_data[cudnnbatchnorm::kData].get_with_shape(shape_, s); + Tensor dx = in_grad[cudnnbatchnorm::kData].get_with_shape(shape_, s); + Tensor dy = out_grad[cudnnbatchnorm::kOut].get_with_shape(shape_, s); + Tensor gamma = in_data[cudnnbatchnorm::kBeta].get_with_shape(Shape1(shape_[1]), s); + Tensor dbeta = in_grad[cudnnbatchnorm::kBeta].get_with_shape(Shape1(shape_[1]), s); + Tensor dgamma = in_grad[cudnnbatchnorm::kGamma].get_with_shape(Shape1(shape_[1]), s); + Tensor save_mean = out_data[cudnnbatchnorm::kMean].get_with_shape(Shape1(shape_[1]), s); + Tensor save_inv_var = out_data[cudnnbatchnorm::kInvVar].get_with_shape(Shape1(shape_[1]), s); + float a = 1.0f, b = 0.0f; + CHECK_EQ(s->dnn_handle_ownership_, mshadow::Stream::OwnHandle); + CHECK_EQ(cudnnBatchNormalizationBackward(s->dnn_handle_, + CUDNN_BATCHNORM_SPATIAL, + &a, + &b, + io_desc_, + x.dptr_, + io_desc_, + dy.dptr_, + io_desc_, + dx.dptr_, + mean_desc_, + gamma.dptr_, + dgamma.dptr_, + dbeta.dptr_, + param_.eps, + save_mean.dptr_, + save_inv_var.dptr_), CUDNN_STATUS_SUCCESS); + } + + private: + bool init_cudnn_; + cudnnDataType_t dtype_; + cudnnTensorDescriptor_t io_desc_, mean_desc_; + mshadow::Shape<4> shape_; + CuDNNBatchNormParam param_; +}; + +template<> +Operator *CreateOp(CuDNNBatchNormParam param) { + return new CuDNNBatchNormOp(param); +} +#endif // CUDNN_MAJOR >= 4 +} // namespace op +} // namespace mxnet + diff --git a/src/operator/embedding-inl.h b/src/operator/embedding-inl.h index ec37dbc968f1..f0c9ccaf3431 100644 --- a/src/operator/embedding-inl.h +++ b/src/operator/embedding-inl.h @@ -53,6 +53,14 @@ class EmbeddingOp : public Operator { CHECK_EQ(req[embedding::kOut], kWriteTo); CHECK_EQ(in_data.size(), 2); CHECK_EQ(out_data.size(), 1); + CHECK_EQ(in_data[embedding::kData].ndim(), 1) + << "Embedding layer expects its input to be one-dimensional. " + << in_data[embedding::kData].ndim() + << " dimensional input is given instead"; + CHECK_EQ(in_data[embedding::kWeight].ndim(), 2) + << "Embedding layer expects its weight to be two-dimensional. 
" + << in_data[embedding::kWeight].ndim() + << " dimensional input is given instead"; Stream *s = ctx.get_stream(); Tensor data = in_data[embedding::kData].get(s); Tensor wmat = in_data[embedding::kWeight].get(s); diff --git a/src/symbol/graph_executor.cc b/src/symbol/graph_executor.cc index 0134135c58e5..2a1b4884d43e 100644 --- a/src/symbol/graph_executor.cc +++ b/src/symbol/graph_executor.cc @@ -300,7 +300,7 @@ void GraphExecutor::InitGraph(const Symbol &symbol, } std::sort(head_nodes.begin(), head_nodes.end()); head_nodes.resize(std::unique(head_nodes.begin(), head_nodes.end()) - head_nodes.begin()); - std::vector fwd_nodes = graph_.PostDFSOrder(head_nodes, {}); + std::vector fwd_nodes = graph_.PostDFSOrder(head_nodes, std::unordered_set()); num_forward_nodes_ = fwd_nodes.size(); std::unordered_set fwd_set(fwd_nodes.begin(), fwd_nodes.end()); diff --git a/src/symbol/static_graph.cc b/src/symbol/static_graph.cc index 5cb46b2be203..0ad7de4d6c55 100644 --- a/src/symbol/static_graph.cc +++ b/src/symbol/static_graph.cc @@ -255,6 +255,7 @@ void StaticGraph::MakeBackwardPass(std::vector *head_grad_nodes, if (type == "Dropout") return false; if (type == "Concat") return false; if (type == "SoftmaxOutput") return false; + if (type == "CuDNNBatchNorm") return false; ++pcounter[0]; if (pcounter[0] % mirror_step == 0) return false; return true; diff --git a/src/symbol/static_graph.h b/src/symbol/static_graph.h index db93602b80bf..639a47d8a4a7 100644 --- a/src/symbol/static_graph.h +++ b/src/symbol/static_graph.h @@ -183,7 +183,8 @@ class StaticGraph { * \return a post DFS visit order of nodes that can reach heads. */ std::vector PostDFSOrder(const std::vector& head_nodes, - const std::unordered_set& banned = {}) const; + const std::unordered_set& banned + = std::unordered_set()) const; /*! * \brief infer the node shapes in the computation graph. * diff --git a/tests/python/unittest/check_utils.py b/tests/python/unittest/check_utils.py index 6c858c9aa5a2..5fe415593584 100644 --- a/tests/python/unittest/check_utils.py +++ b/tests/python/unittest/check_utils.py @@ -50,7 +50,7 @@ def numeric_grad(executor, location, eps=1e-4): a[:] = np.asarray(l) approx_grads = [np.zeros_like(l) for l in location] - executor.forward() + executor.forward(is_train=True) f_x = executor.outputs[0].asnumpy() x_copy = [np.copy(x) for x in location] @@ -62,7 +62,7 @@ def numeric_grad(executor, location, eps=1e-4): # set initial states. Need to set all due to inplace operations for inp, val in zip(args, location): inp[:] = val - executor.forward() + executor.forward(is_train=True) f_eps = executor.outputs[0].asnumpy() ap_grad.ravel()[i] = (f_eps - f_x) / eps loc.ravel()[i] = reset.ravel()[i] @@ -72,7 +72,7 @@ def numeric_grad(executor, location, eps=1e-4): rng = np.random.RandomState(1234) -def check_numeric_gradient(sym, location, numeric_eps=1e-4, check_eps=1e-2): +def check_numeric_gradient(sym, location, aux_states=[], numeric_eps=1e-4, check_eps=1e-2): """ Verify an operation by checking backwards pass via finite difference method. 
@@ -114,8 +114,9 @@ def random_projection(shape):
     arr_data = [mx.nd.array(l) for l in location] + [mx.nd.empty(out_shape[0])]
     arr_grad = [mx.nd.empty(l.shape) for l in location] + [mx.nd.empty(out_shape[0])]
+    arr_aux = [mx.nd.array(l) for l in aux_states]
 
-    executor = out.bind(mx.cpu(), args=arr_data, args_grad=arr_grad)
+    executor = out.bind(mx.cpu(), args=arr_data, args_grad=arr_grad, aux_states=arr_aux)
 
     location = location + [random_projection(out_shape[0])]
     inps = executor.arg_arrays
@@ -132,7 +133,7 @@
 
     assert len(executor.outputs) == 1
 
-    executor.forward()
+    executor.forward(is_train=True)
     executor.backward()
     # remove the proj from grads
     symbolic_grad = [g.asnumpy() for g in executor.grad_arrays[0:-1]]
@@ -140,9 +141,10 @@
     # refactor forward out of here as this no longer computes correct forward pass
     numeric_gradients = numeric_grad(executor, location, eps=numeric_eps)
 
-    for numeric, symbolic in zip(numeric_gradients, symbolic_grad):
+    for name, numeric, symbolic in zip(out.list_arguments(), numeric_gradients, symbolic_grad):
         rel = reldiff(numeric, symbolic)
-        assert rel <= check_eps
+        if rel > check_eps:
+            raise Exception("Numeric check failed for %s: relative error %f, expected <= %f" % (name, rel, check_eps))
 
 def check_symbolic_forward(sym, location, expected, check_eps=1e-5):
     """ Compare forward call to expected value.
diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py
index 4579bdbdf4cb..decbb08bbec7 100644
--- a/tests/python/unittest/test_operator.py
+++ b/tests/python/unittest/test_operator.py
@@ -621,6 +621,29 @@ def test_nearest_upsampling():
         shapes = [(1,3,base*root_scale*scale**(num_shape-1-i),base*root_scale*scale**(num_shape-1-i)) for i in range(num_shape)]
         check_nearest_upsampling_with_shape(shapes, scale, root_scale)
 
+def test_batchnorm_training():
+    for shape in [(2, 3), (2, 3, 2, 2)]:
+        data_tmp = np.random.normal(size=shape)
+        s = shape[1],
+        gamma = np.ones(s)
+        beta = np.ones(s)
+        gamma[1] = 3
+        beta[0] = 3
+
+        rolling_mean = np.random.uniform(size=s)
+        rolling_std = np.random.uniform(size=s)
+
+        data = mx.symbol.Variable('data')
+        test = mx.symbol.BatchNorm(data, fix_gamma=False)
+
+        check_numeric_gradient(test, [data_tmp, gamma, beta], [rolling_mean, rolling_std], numeric_eps=1e-3, check_eps=5e-2)
+
+        # Gamma needs to be fixed at one when fix_gamma is true.
+        gamma = np.ones(s)
+
+        test = mx.symbol.BatchNorm(data, fix_gamma=True)
+        check_numeric_gradient(test, [data_tmp, gamma, beta], [rolling_mean, rolling_std], numeric_eps=1e-3, check_eps=5e-2)
+
 if __name__ == '__main__':
     test_nearest_upsampling()
     test_binary_op_duplicate_input()
@@ -641,5 +664,6 @@
     test_abs()
     test_round_ceil_floor()
     test_deconvolution()
+    test_batchnorm_training()
     #check_softmax_with_shape((3,4), mx.cpu())
     #check_multi_softmax_with_shape((3,4,5), mx.cpu())
diff --git a/tests/travis/run_test.sh b/tests/travis/run_test.sh
index 8e637285eef0..f0d667ab213a 100755
--- a/tests/travis/run_test.sh
+++ b/tests/travis/run_test.sh
@@ -113,6 +113,10 @@
 fi
 
 if [ ${TASK} == "scala_test" ]; then
+    if [ ${TRAVIS_OS_NAME} == "osx" ]; then
+        LIB_GOMP_PATH=`find /usr/local/lib -name libgomp.dylib | grep -v i386 | head -n1`
+        ln -sf $LIB_GOMP_PATH /usr/local/lib/libgomp.dylib
+    fi
     make all || exit -1
     # use cached dir for storing data
     rm -rf ${PWD}/data
diff --git a/tools/im2rec.cc b/tools/im2rec.cc
index f7472ff628b8..fb8599ca471d 100644
--- a/tools/im2rec.cc
+++ b/tools/im2rec.cc
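Trimmed to its core, the new BatchNorm test supplies the auxiliary moving statistics alongside data, gamma, and beta; the call pattern is (shapes illustrative, helper imported from the unittest directory shown above):

import numpy as np
import mxnet as mx
from check_utils import check_numeric_gradient

shape, s = (2, 3, 2, 2), (3,)
net = mx.symbol.BatchNorm(mx.symbol.Variable('data'), fix_gamma=False)
check_numeric_gradient(net,
                       [np.random.normal(size=shape), np.ones(s), np.ones(s)],
                       aux_states=[np.random.uniform(size=s), np.random.uniform(size=s)],
                       numeric_eps=1e-3, check_eps=5e-2)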
diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py
index 4579bdbdf4cb..decbb08bbec7 100644
--- a/tests/python/unittest/test_operator.py
+++ b/tests/python/unittest/test_operator.py
@@ -621,6 +621,29 @@ def test_nearest_upsampling():
         shapes = [(1,3,base*root_scale*scale**(num_shape-1-i),base*root_scale*scale**(num_shape-1-i)) for i in range(num_shape)]
         check_nearest_upsampling_with_shape(shapes, scale, root_scale)
 
+def test_batchnorm_training():
+    for shape in [(2, 3), (2, 3, 2, 2)]:
+        data_tmp = np.random.normal(size=shape)
+        s = shape[1],
+        gamma = np.ones(s)
+        beta = np.ones(s)
+        gamma[1] = 3
+        beta[0] = 3
+
+        rolling_mean = np.random.uniform(size=s)
+        rolling_std = np.random.uniform(size=s)
+
+        data = mx.symbol.Variable('data')
+        test = mx.symbol.BatchNorm(data, fix_gamma=False)
+
+        check_numeric_gradient(test, [data_tmp, gamma, beta], [rolling_mean, rolling_std], numeric_eps=1e-3, check_eps=5e-2)
+
+        # Gamma needs to be fixed at one when fix_gamma is true
+        gamma = np.ones(s)
+
+        test = mx.symbol.BatchNorm(data, fix_gamma=True)
+        check_numeric_gradient(test, [data_tmp, gamma, beta], [rolling_mean, rolling_std], numeric_eps=1e-3, check_eps=5e-2)
+
 if __name__ == '__main__':
     test_nearest_upsampling()
     test_binary_op_duplicate_input()
@@ -641,5 +664,6 @@ def test_nearest_upsampling():
     test_abs()
     test_round_ceil_floor()
     test_deconvolution()
+    test_batchnorm_training()
     #check_softmax_with_shape((3,4), mx.cpu())
     #check_multi_softmax_with_shape((3,4,5), mx.cpu())
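The new `test_batchnorm_training` pushes BatchNorm through the numeric checker in training mode, where the batch statistics (not the rolling mean/std passed as aux states) normalize the data. A simplified NumPy sketch of that forward pass for 2D input; the eps constant is illustrative, not necessarily MXNet's default:

```python
import numpy as np

def batchnorm_train_forward(x, gamma, beta, eps=1e-3):
    """Training-mode batch norm for input of shape (batch, channel):
    normalize with the batch statistics, then scale and shift."""
    mean = x.mean(axis=0)
    var = x.var(axis=0)
    x_hat = (x - mean) / np.sqrt(var + eps)  # per-channel normalization
    return gamma * x_hat + beta

x = np.random.normal(size=(2, 3))
y = batchnorm_train_forward(x, gamma=np.ones(3), beta=np.zeros(3))
print(y.mean(axis=0))  # ~0 per channel
print(y.std(axis=0))   # ~1 per channel (up to eps)
```

With `fix_gamma=True`, gamma is pinned at one and receives no gradient, which is why the test resets it before the second check.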
diff --git a/tests/travis/run_test.sh b/tests/travis/run_test.sh
index 8e637285eef0..f0d667ab213a 100755
--- a/tests/travis/run_test.sh
+++ b/tests/travis/run_test.sh
@@ -113,6 +113,10 @@ fi
 
 if [ ${TASK} == "scala_test" ]; then
+    if [ ${TRAVIS_OS_NAME} == "osx" ]; then
+        LIB_GOMP_PATH=`find /usr/local/lib -name libgomp.dylib | grep -v i386 | head -n1`
+        ln -sf $LIB_GOMP_PATH /usr/local/lib/libgomp.dylib
+    fi
     make all || exit -1
     # use cached dir for storing data
     rm -rf ${PWD}/data
diff --git a/tools/im2rec.cc b/tools/im2rec.cc
index f7472ff628b8..fb8599ca471d 100644
--- a/tools/im2rec.cc
+++ b/tools/im2rec.cc
@@ -10,6 +10,7 @@
  */
 #include
 #include
+#include
 #include
 #include
 #include
@@ -25,13 +26,15 @@ int main(int argc, char *argv[]) {
   if (argc < 4) {
     printf("Usage: <image.lst> <image_root_dir> <output_file> [additional parameters in form key=value]\n"\
-           "\tcolor=USE_COLOR[default=1] Use color (1) or gray image (0)\n"\
+           "\tcolor=USE_COLOR[default=1] Force color (1), gray image (0) or keep the source unchanged (-1).\n"\
            "\tresize=newsize resize the shorter edge of image to the newsize, original images will be packed by default\n"\
            "\tlabel_width=WIDTH[default=1] specify the label_width in the list, by default set to 1\n"\
            "\tnsplit=NSPLIT[default=1] used for part generation, logically split the image.list to NSPLIT parts by position\n"\
-           "\tpart=PART[default=0] used for part generation, pack the images from the specific part in image.list\n"
-           "\tcenter_crop=CENTER_CROP[default=0] specify whether to crop the center image to make it square.\n"
-           "\tquality=QUALITY[default=80] JPEG quality for encoding, 1-100.\n");
+           "\tpart=PART[default=0] used for part generation, pack the images from the specific part in image.list\n"\
+           "\tcenter_crop=CENTER_CROP[default=0] specify whether to crop the center image to make it square.\n"\
+           "\tquality=QUALITY[default=80] JPEG quality for encoding (1-100, default: 80) or PNG compression for encoding (1-9, default: 3).\n"\
+           "\tencoding=ENCODING[default='.jpg'] Encoding type. Can be '.jpg' or '.png'.\n"\
+           "\tunchanged=UNCHANGED[default=0] Keep the original image encoding, size and color. If set to 1, all other parameters are ignored.\n");
     return 0;
   }
   int label_width = 1;
@@ -41,6 +44,8 @@ int main(int argc, char *argv[]) {
   int center_crop = 0;
   int quality = 80;
   int color_mode = CV_LOAD_IMAGE_COLOR;
+  int unchanged = 0;
+  std::string encoding(".jpg");
   for (int i = 4; i < argc; ++i) {
     char key[128], val[128];
     if (sscanf(argv[i], "%[^=]=%s", key, val) == 2) {
@@ -51,8 +56,18 @@ int main(int argc, char *argv[]) {
       if (!strcmp(key, "center_crop")) center_crop = atoi(val);
       if (!strcmp(key, "quality")) quality = atoi(val);
       if (!strcmp(key, "color")) color_mode = atoi(val);
+      if (!strcmp(key, "encoding")) encoding = std::string(val);
+      if (!strcmp(key, "unchanged")) unchanged = atoi(val);
     }
   }
+  // Check parameter ranges
+  if (color_mode != -1 && color_mode != 0 && color_mode != 1) {
+    LOG(FATAL) << "Color mode must be -1, 0 or 1.";
+  }
+  if (encoding != std::string(".jpg") && encoding != std::string(".png")) {
+    LOG(FATAL) << "Encoding mode must be .jpg or .png.";
+  }
+
   if (new_size > 0) {
     LOG(INFO) << "New Image Size: Short Edge " << new_size;
   } else {
@@ -64,6 +79,14 @@ int main(int argc, char *argv[]) {
   if (color_mode == 0) {
     LOG(INFO) << "Use gray images";
   }
+  if (color_mode == -1) {
+    LOG(INFO) << "Keep original color mode";
+  }
+  LOG(INFO) << "Encoding is " << encoding;
+
+  if (encoding == std::string(".png") && quality > 9) {
+    quality = 3;
+  }
   using namespace dmlc;
   const static size_t kBufferSize = 1 << 20UL;
@@ -81,15 +104,22 @@ int main(int argc, char *argv[]) {
   }
   LOG(INFO) << "Write to output: " << os.str();
   dmlc::Stream *fo = dmlc::Stream::Create(os.str().c_str(), "w");
-  LOG(INFO) << "Output: " << argv[3];
+  LOG(INFO) << "Output: " << os.str();
   dmlc::RecordIOWriter writer(fo);
   std::string fname, path, blob;
   std::vector<unsigned char> decode_buf;
   std::vector<unsigned char> encode_buf;
   std::vector<int> encode_params;
-  encode_params.push_back(CV_IMWRITE_JPEG_QUALITY);
-  encode_params.push_back(quality);
-  LOG(INFO) << "JPEG encoding quality: " << quality;
+  if (encoding == std::string(".png")) {
+    encode_params.push_back(CV_IMWRITE_PNG_COMPRESSION);
+    encode_params.push_back(quality);
+    LOG(INFO) << "PNG encoding compression: " << quality;
+  }
+  else {
+    encode_params.push_back(CV_IMWRITE_JPEG_QUALITY);
+    encode_params.push_back(quality);
+    LOG(INFO) << "JPEG encoding quality: " << quality;
+  }
   dmlc::InputSplit::Blob line;
   while (flist->NextRecord(&line)) {
@@ -124,33 +154,37 @@ int main(int argc, char *argv[]) {
       if (nread != kBufferSize) break;
     }
     delete fi;
-    if (new_size > 0) {
+
+    if (unchanged != 1) {
       cv::Mat img = cv::imdecode(decode_buf, color_mode);
       CHECK(img.data != NULL) << "OpenCV decode fail:" << path;
-      if (center_crop) {
+      cv::Mat res = img;
+      if (new_size > 0) {
+        if (center_crop) {
+          if (img.rows > img.cols) {
+            int margin = (img.rows - img.cols)/2;
+            img = img(cv::Range(margin, margin+img.cols), cv::Range(0, img.cols));
+          } else {
+            int margin = (img.cols - img.rows)/2;
+            img = img(cv::Range(0, img.rows), cv::Range(margin, margin + img.rows));
+          }
+        }
         if (img.rows > img.cols) {
-          int margin = (img.rows - img.cols)/2;
-          img = img(cv::Range(margin, margin+img.cols), cv::Range(0, img.cols));
+          cv::resize(img, res, cv::Size(new_size, img.rows * new_size / img.cols),
+                     0, 0, CV_INTER_LINEAR);
         } else {
-          int margin = (img.cols - img.rows)/2;
-          img = img(cv::Range(0, img.rows), cv::Range(margin, margin + img.rows));
+          cv::resize(img, res, cv::Size(new_size * img.cols / img.rows, new_size),
+                     0, 0, CV_INTER_LINEAR);
         }
       }
-      cv::Mat res;
-      if (img.rows > img.cols) {
-        cv::resize(img, res, cv::Size(new_size, img.rows * new_size / img.cols),
-                   0, 0, CV_INTER_LINEAR);
-      } else {
-        cv::resize(img, res, cv::Size(new_size * img.cols / img.rows, new_size),
-                   0, 0, CV_INTER_LINEAR);
-      }
       encode_buf.clear();
-      CHECK(cv::imencode(".jpg", res, encode_buf, encode_params));
+      CHECK(cv::imencode(encoding, res, encode_buf, encode_params));
       size_t bsize = blob.size();
       blob.resize(bsize + encode_buf.size());
       memcpy(BeginPtr(blob) + bsize, BeginPtr(encode_buf), encode_buf.size());
-    } else {
+    }
+    else {
       size_t bsize = blob.size();
       blob.resize(bsize + decode_buf.size());
      memcpy(BeginPtr(blob) + bsize,
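Note how `quality` is overloaded above: JPEG quality (1-100) or PNG compression level (values above 9 are reset to the tool's default of 3). For comparison, the equivalent encode calls through OpenCV's Python bindings; constant names are as exposed by `cv2` in OpenCV 3, older releases expose them under `cv2.cv.CV_*`:

```python
import cv2
import numpy as np

img = (np.random.rand(32, 32, 3) * 255).astype(np.uint8)  # dummy BGR image

# JPEG: quality 1-100, higher = better fidelity, larger output
ok, jpg_buf = cv2.imencode('.jpg', img, [int(cv2.IMWRITE_JPEG_QUALITY), 80])
assert ok

# PNG: compression level 0-9, higher = smaller output, slower encode
ok, png_buf = cv2.imencode('.png', img, [int(cv2.IMWRITE_PNG_COMPRESSION), 3])
assert ok
```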
diff --git a/tools/im2rec.py b/tools/im2rec.py
index 3bb04ab96cd4..9875c1db531b 100644
--- a/tools/im2rec.py
+++ b/tools/im2rec.py
@@ -41,7 +41,7 @@ def make_list(prefix_out, root, recursive, exts, num_chunks, train_ratio):
     image_list = list_image(root, recursive, exts)
     random.shuffle(image_list)
     N = len(image_list)
-    chunk_size = N/num_chunks
+    chunk_size = (N+num_chunks-1)/num_chunks
     for i in xrange(num_chunks):
         chunk = image_list[i*chunk_size:(i+1)*chunk_size]
         if num_chunks > 1:
@@ -70,9 +70,12 @@ def write_record(args, image_list):
     record = mx.recordio.MXRecordIO(args.prefix+'.rec', 'w')
     lock = threading.Lock()
     tic = [time.time()]
+    color_modes = {-1: cv2.IMREAD_UNCHANGED,
+                    0: cv2.IMREAD_GRAYSCALE,
+                    1: cv2.IMREAD_COLOR}
     def worker(i):
         item = source.pop(0)
-        img = cv2.imread(os.path.join(args.root, item[1]))
+        img = cv2.imread(os.path.join(args.root, item[1]), color_modes[args.color])
         if args.center_crop:
             if img.shape[0] > img.shape[1]:
                 margin = (img.shape[0] - img.shape[1])/2;
@@ -87,7 +90,7 @@ def worker(i):
                 newsize = (args.resize, img.shape[1]*args.resize/img.shape[0])
             img = cv2.resize(img, newsize)
         header = mx.recordio.IRHeader(0, item[2], item[0], 0)
-        s = mx.recordio.pack_img(header, img, quality=args.quality)
+        s = mx.recordio.pack_img(header, img, quality=args.quality, img_fmt=args.encoding)
         lock.acquire()
         record.write(s)
         sink.append(item)
@@ -141,12 +144,19 @@ def main():
     rgroup.add_argument('--center_crop', type=bool, default=False,
                         help='specify whether to crop the center image to make it square.')
     rgroup.add_argument('--quality', type=int, default=80,
-                        help='JPEG quality for encoding, 1-100.')
+                        help='JPEG quality for encoding, 1-100; or PNG compression for encoding, 1-9.')
     rgroup.add_argument('--num_thread', type=int, default=1,
                         help='number of thread to use for encoding. order of images will be different\
                         from the input list if >1. the input list will be modified to match the\
                         resulting order.')
-
+    rgroup.add_argument('--color', type=int, default=1, choices=[-1, 0, 1],
+                        help='specify the color mode of the loaded image.\
+                        1: Loads a color image. Any transparency of image will be neglected. It is the default flag.\
+                        0: Loads image in grayscale mode.\
+                        -1: Loads image as such, including alpha channel.')
+    rgroup.add_argument('--encoding', type=str, default='.jpg', choices=['.jpg', '.png'],
+                        help='specify the encoding of the images.')
+
     args = parser.parse_args()
 
     if args.list:
@@ -157,4 +167,4 @@ def main():
         write_record(args, image_list)
 
 if __name__ == '__main__':
-    main()
\ No newline at end of file
+    main()
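One behavioral fix in `make_list` is easy to miss: with floor division, up to `num_chunks - 1` trailing images were silently dropped whenever `N` is not a multiple of `num_chunks`; ceiling division covers the full list. A quick illustration (using `//` for explicit integer division; the Python 2 code above gets the same effect from `/` on ints):

```python
N, num_chunks = 10, 3
items = list(range(N))

old = N // num_chunks                      # 3: chunks cover only 9 items
new = (N + num_chunks - 1) // num_chunks   # 4: chunks cover all 10 items

print([items[i*old:(i+1)*old] for i in range(num_chunks)])
# [[0, 1, 2], [3, 4, 5], [6, 7, 8]]  -- item 9 is silently dropped
print([items[i*new:(i+1)*new] for i in range(num_chunks)])
# [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9]]
```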