diff --git a/.gitignore b/.gitignore
index 516320555b63..bbf8acb67285 100644
--- a/.gitignore
+++ b/.gitignore
@@ -66,3 +66,5 @@ deps
 # R
 *.Rcheck
 *.rds
+*.Rproj
+.Rproj.user
\ No newline at end of file
diff --git a/.travis.yml b/.travis.yml
index 313c4ec2a52f..f3b3e0d4018f 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,4 +1,3 @@
-# disable sudo to use container based build
 sudo: false
 
 language: cpp
@@ -18,6 +17,7 @@ env:
     - TASK=cpp_test
     # run tests/python
     - TASK=python_test
+    #- TASK=r_test
     # TODO, R test, distributed test, clang, more g++ versions
@@ -27,6 +27,8 @@ matrix:
       env: TASK=lint
     - os: osx
       env: TASK=doc
+    - os: linux
+      env: TASK=r_test
 
 # dependent apt packages
 addons:
@@ -67,10 +69,13 @@ cache:
 before_cache:
   - dmlc-core/scripts/travis/travis_before_cache.sh
+
+after_failure:
+  - tests/travis/travis_after_failure.sh
 
 notifications:
 # Emails are sent to the committer's git-configured email address by default,
   email:
     on_success: change
     on_failure: always
-  slack: dmlc:NmroCzntCiWOuxUZpii40USd
+  #slack: dmlc:NmroCzntCiWOuxUZpii40USd
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 67e0b881df5b..05ce844b45f3 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -9,6 +9,9 @@ mxnet_option(USE_OPENCV "Build with OpenCV support" ON)
 mxnet_option(USE_OPENMP "Build with Openmp support" ON)
 mxnet_option(USE_CUDNN  "Build with cudnn support" ON) # one could set CUDNN_ROOT for search path
 mxnet_option(USE_CUDA   "Build with CUDA support" ON)
+mxnet_option(USE_DIST_KVSTORE "Build with DIST_KVSTORE support" OFF)
+
+
 include(mshadow/cmake/mshadow.cmake)
 include(mshadow/cmake/Utils.cmake)
@@ -65,8 +68,8 @@ if(USE_CUDNN)
 endif()
 
 add_subdirectory("dmlc-core")
-if(NOT MSVC)
-  add_subdirectory("ps-lite")
+if(USE_DIST_KVSTORE)
+  add_subdirectory("ps-lite")
 endif()
 
 mxnet_source_group("Source" GLOB_RECURSE "src/*.cc")
@@ -91,6 +94,7 @@ if(USE_CUDA)
   list(APPEND SOURCE ${cuda_objs} ${cuda})
 endif()
 
+
 if(NOT MSVC)
   # Only add c++11 flags and definitions after cuda compiling
   add_definitions(-DDMLC_USE_CXX11)
@@ -102,12 +106,18 @@ endif()
 add_library(mxnet SHARED ${SOURCE})
 target_link_libraries(mxnet ${mshadow_LINKER_LIBS})
 target_link_libraries(mxnet dmlccore)
-if(NOT MSVC)
-  target_link_libraries(mxnet pslite)
-  target_link_libraries(mxnet ${pslite_LINKER_LIBS})
-endif()
+
+
+
 set_target_properties(mxnet PROPERTIES OUTPUT_NAME "libmxnet")
+if(USE_DIST_KVSTORE)
+  add_definitions(-DMXNET_USE_DIST_KVSTORE)
+  target_link_libraries(mxnet pslite)
+  target_link_libraries(mxnet ${pslite_LINKER_LIBS})
+  include_directories(SYSTEM ${pslite_INCLUDE_DIR})
+endif()
+
 # ---[ Linter target
 if(MSVC)
   find_package(PythonInterp 2)
diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index 9f72042fb3ce..8299f53aa9ca 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -26,6 +26,12 @@ The committers are granted write access to the project.
   - Mingjie is the initiator, and contributes the design of the dependency engine.
 * [Chuntao Hong](https://github.com/hjk41)
   - Chuntao is the initiator and provides the initial design of the engine.
+* [Chiyuan Zhang](https://github.com/pluskid)
+  - Chiyuan is the creator of the MXNet Julia package.
+* [Qiang Kou](https://github.com/thirdwing)
+  - KK is an R ninja; he makes mxnet available for R users.
+* [Tong He](https://github.com/hetong007)
+  - Tong is the major maintainer of MXNetR; he designed the mxnet interface and wrote many of the R tutorials.
 
 ### Become a Committer
 MXNet is an open-source project and we are actively looking for new committers
@@ -39,12 +45,10 @@ List of Contributors
 --------------------
 * [Full List of Contributors](https://github.com/dmlc/mxnet/graphs/contributors)
   - To contributors: please add your name to the list when you submit a patch to the project:)
-* [Qiang Kou](https://github.com/thirdwing)
-  - KK is a R ninja, he makes mxnet available for R users.
-* [Tong He](https://github.com/hetong007)
-  - Tong is the major maintainer of MXNetR, he designs the mxnet interface and wrote many of the tutorials on R.
 * [Feng Wang](https://github.com/happynear)
   - Feng makes mxnet compatible with Windows Visual Studio.
+* [Jack Deng](https://github.com/jdeng)
+  - Jack created the amalgamation script and the Go binding for mxnet.
 * [Li Dong](https://github.com/donglixp)
 * [Piji Li](https://github.com/lipiji)
 * [Hu Shiwen](https://github.com/yajiedesign)
@@ -55,3 +59,8 @@ List of Contributors
 * [Xiaodong](https://github.com/XD-DENG)
 * [Nan Xiao](https://github.com/road2stat)
 * [Junyuan Xie](https://github.com/piiswrong)
+* [Wei Wu](https://github.com/tornadomeet)
+* [Yuan Tang](https://github.com/terrytangyuan)
+  - Yuan set up the R Travis environment to make development safer.
+* [Michaël Benesty](https://github.com/pommedeterresautee)
+  - Michaël contributed the R visualization module of mxnet.
diff --git a/R-package/DESCRIPTION b/R-package/DESCRIPTION
index 27a81e75861e..f42bed506a71 100644
--- a/R-package/DESCRIPTION
+++ b/R-package/DESCRIPTION
@@ -5,10 +5,21 @@ Version: 0.5
 Date: 2015-10-02
 Author: Tianqi Chen, Qiang Kou, Tong He
 Maintainer: Qiang Kou
-Description: MXNet is a deep learning framework designed for both efficiency and flexibility. It allows you to mix the flavours of deep learning programs together to maximize the efficiency and your productivity.
+Description: MXNet is a deep learning framework designed for both efficiency
+    and flexibility. It allows you to mix the flavours of deep learning programs
+    together to maximize the efficiency and your productivity.
 License: BSD
 URL: https://github.com/dmlc/mxnet/R-package
 BugReports: https://github.com/dmlc/mxnet/issues
-Imports: methods, Rcpp (>= 0.11.1)
-Suggests: testthat
+Imports:
+    methods,
+    Rcpp (>= 0.11.1),
+    DiagrammeR,
+    data.table,
+    jsonlite,
+    magrittr,
+    stringr
+Suggests:
+    testthat
 LinkingTo: Rcpp
+RoxygenNote: 5.0.0
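[Note: the DESCRIPTION hunk above adds five new hard dependencies for the visualization code. A minimal sketch of satisfying them before loading the package when tracking this branch (all five are CRAN packages):]

    # install the new Imports declared above, then attach mxnet again
    install.packages(c("DiagrammeR", "data.table", "jsonlite",
                       "magrittr", "stringr"))
    library(mxnet)  # should now load without missing-namespace errors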
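[Note: the hunks above export elementwise NDArray math (mx.nd.exp, mx.nd.log, mx.nd.norm, alongside the retitled mx.nd.sqrt and mx.nd.square). These operate on NDArrays rather than R vectors; a small usage sketch, using only functions exported by this patch:]

    library(mxnet)
    x <- mx.nd.array(c(1, 4, 9, 16))      # a 1-d NDArray on the default context
    as.array(mx.nd.sqrt(x))               # 1 2 3 4
    as.array(mx.nd.log(mx.nd.exp(x)))     # recovers x, up to float error
    as.array(mx.nd.norm(x))               # L2 norm: an NDArray of shape (1,)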
@@ -208,6 +238,19 @@ mx.symbol.BatchNorm <- function(...) {
   mx.varg.symbol.BatchNorm(list(...))
 }
 
+#' Get output from a symbol and pass 0 gradient back
+#'
+#' @param data Symbol
+#'     Input data.
+#' @param name string, optional
+#'     Name of the resulting symbol.
+#' @return out The result mx.symbol
+#'
+#' @export
+mx.symbol.BlockGrad <- function(...) {
+  mx.varg.symbol.BlockGrad(list(...))
+}
+
 #' Perform a feature concat on channel dim (dim 1) over all the inputs.
 #'
 #' @param num.args int, required
@@ -238,7 +281,7 @@ mx.symbol.Concat <- function(...) {
 #' @param num.filter int (non-negative), required
 #'     convolution filter(channel) number
 #' @param num.group int (non-negative), optional, default=1
-#'     number of groups partition
+#'     Number of group partitions. This option is not supported by CuDNN; you can achieve the same effect by using SliceChannel to split into num_group groups, applying convolution to each, and concatenating the results.
 #' @param workspace long (non-negative), optional, default=512
 #'     Tmp workspace for convolution (MB)
 #' @param no.bias boolean, optional, default=False
@@ -442,6 +485,8 @@ mx.symbol.SliceChannel <- function(...) {
 #'     Input data to softmax.
 #' @param grad.scale float, optional, default=1
 #'     Scale the gradient by a float factor
+#' @param multi.output boolean, optional, default=False
+#'     If set to true, for an (n,k,x_1,..,x_n) dimensional input tensor, softmax will generate n*x_1*...*x_n outputs, each with k classes
 #' @param name string, optional
 #'     Name of the resulting symbol.
 #' @return out The result mx.symbol
@@ -450,3 +495,55 @@
 mx.symbol.Softmax <- function(...) {
   mx.varg.symbol.Softmax(list(...))
 }
+
+#' Take exp of the src
+#'
+#' @param src Symbol
+#'     Source symbolic input to the function
+#' @param name string, optional
+#'     Name of the resulting symbol.
+#' @return out The result mx.symbol
+#'
+#' @export
+mx.symbol.exp <- function(...) {
+  mx.varg.symbol.exp(list(...))
+}
+
+#' Take log of the src
+#'
+#' @param src Symbol
+#'     Source symbolic input to the function
+#' @param name string, optional
+#'     Name of the resulting symbol.
+#' @return out The result mx.symbol
+#'
+#' @export
+mx.symbol.log <- function(...) {
+  mx.varg.symbol.log(list(...))
+}
+
+#' Take sqrt of the src
+#'
+#' @param src Symbol
+#'     Source symbolic input to the function
+#' @param name string, optional
+#'     Name of the resulting symbol.
+#' @return out The result mx.symbol
+#'
+#' @export
+mx.symbol.sqrt <- function(...) {
+  mx.varg.symbol.sqrt(list(...))
+}
+
+#' Take square of the src
+#'
+#' @param src Symbol
+#'     Source symbolic input to the function
+#' @param name string, optional
+#'     Name of the resulting symbol.
+#' @return out The result mx.symbol
+#'
+#' @export
+mx.symbol.square <- function(...) {
+  mx.varg.symbol.square(list(...))
+}
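[Note: the symbolic wrappers just added (mx.symbol.exp/log/sqrt/square and mx.symbol.BlockGrad) only declare graph nodes; nothing is computed until the symbol is bound to an executor. A composition sketch — the positional-argument style mirrors the demos elsewhere in this patch:]

    library(mxnet)
    data <- mx.symbol.Variable("data")
    # builds sqrt(x^2) symbolically; no computation happens here
    out <- mx.symbol.sqrt(mx.symbol.square(data))
    # BlockGrad forwards its input unchanged but sends a zero gradient
    # back, which can be used to freeze part of a network during training
    frozen <- mx.symbol.BlockGrad(data = out)
    arguments(frozen)  # lists the inputs the graph still expects ("data")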
diff --git a/R-package/R/viz.graph.R b/R-package/R/viz.graph.R
new file mode 100644
index 000000000000..c02eb2d1a97b
--- /dev/null
+++ b/R-package/R/viz.graph.R
@@ -0,0 +1,158 @@
+#'
+#' Convert symbol to dot object for visualization purposes.
+#'
+#' @importFrom magrittr %>%
+#' @importFrom stringr str_extract_all
+#' @importFrom data.table data.table
+#' @importFrom data.table as.data.table
+#' @importFrom data.table :=
+#' @importFrom data.table setkey
+#' @importFrom jsonlite fromJSON
+#' @importFrom DiagrammeR create_nodes
+#' @importFrom DiagrammeR create_graph
+#' @importFrom DiagrammeR create_edges
+#' @importFrom DiagrammeR combine_edges
+#' @importFrom DiagrammeR render_graph
+#'
+#' @param model a \code{string} representing the path to a file containing the \code{JSON} of a model dump, or the actual model dump.
+#' @param graph.title a \code{string} displayed on top of the viz.
+#' @param graph.title.font.name a \code{string} representing the font to use for the title.
+#' @param graph.title.font.size a \code{numeric} representing the size of the font to use for the title.
+#' @param graph.width.px a \code{numeric} representing the size (width) of the graph, in pixels.
+#' @param graph.height.px a \code{numeric} representing the size (height) of the graph, in pixels.
+#'
+#' @return a graph object ready to be displayed with the \code{print} function.
+#'
+#' @export
+graph.viz <- function(model, graph.title = "Computation graph", graph.title.font.name = "Helvetica", graph.title.font.size = 30, graph.width.px = 500, graph.height.px = 500){
+  # generate a color code for each type of node
+  get.color <- function(type) {
+    switch(
+      EXPR = type,
+      "data" = "#8dd3c7",
+      "FullyConnected" = ,
+      "Convolution" = "#fb8072",
+      "LeakyReLU" = ,
+      "Activation" = "#ffffb3",
+      "BatchNorm" = "#bebada",
+      "Pooling" = "#80b1d3",
+      "Flatten" = ,
+      "Reshape" = ,
+      "Concat" = "#fdb462",
+      "Softmax" = "#b3de69",
+      "#fccde5" # default value
+    )
+  }
+
+  # pick a node shape for each type of node
+  get.shape <- function(type) {
+    switch(
+      EXPR = type,
+      "data" = "star",
+      # "FullyConnected" = ,
+      # "Convolution" = "#fb8072",
+      # "LeakyReLU" = ,
+      # "Activation" = "diamond",
+      # "BatchNorm" = "#bebada",
+      "Pooling" = "oval",
+      "Flatten" = ,
+      "Reshape" = ,
+      "Concat" = "invtriangle",
+      # "Softmax" = "#b3de69",
+      "box" # default value
+    )
+  }
+
+  # extract numeric IDs from a string list
+  str2tuple <- function(str) str_extract_all(str, "\\d+") %>% unlist %>% as.numeric
+
+  # generate the text content for each node
+  get.label <- function(type, mat.row) {
+    switch(
+      EXPR = type,
+      "FullyConnected" = mat.row[,param.num_hidden] %>% paste("FullyConnected", ., sep = "\n"),
+      "Convolution" = {
+        kernel.parameters <- mat.row[,param.kernel] %>% str2tuple
+        stride.parameters <- mat.row[,param.stride] %>% str2tuple
+        num_filter.parameters <- mat.row[,param.num_filter] %>% str2tuple
+        paste0("Convolution\n", kernel.parameters[1], "x", kernel.parameters[2],
+               "/", stride.parameters[1], ", ", num_filter.parameters)
+      },
+      "LeakyReLU" = ,
+      "Activation" = mat.row[,param.act_type] %>% paste0(type, "\n", .),
+      "Pooling" = {
+        pool_type.parameters <- mat.row[,param.pool_type] %>% str2tuple
+        kernel.parameters <- mat.row[,param.kernel] %>% str2tuple
+        stride.parameters <- mat.row[,param.stride] %>% str2tuple
+        paste0("Pooling\n", pool_type.parameters, "\n", kernel.parameters[1], "x",
+               kernel.parameters[2], "/", stride.parameters[1])
+      },
+      type # default value
+    )
+  }
+
+  mx.model.json <- fromJSON(model, flatten = T)
+  mx.model.nodes <- mx.model.json$nodes %>% as.data.table
+  mx.model.nodes[,id:= .I - 1]
+  setkey(mx.model.nodes, id)
+  mx.model.json$heads[1,] %>% {mx.model.nodes[id %in% .,op:=name]} # add nodes from heads (mainly data node)
+  mx.model.nodes[,color:= get.color(op), by = id] # by=id to have an execution row per row
+  mx.model.nodes[,shape:= get.shape(op), by = id] # by=id to have an execution row per row
+  mx.model.nodes[,label:= get.label(op, .SD), by = id] # by=id to have an execution row per row
+
+  nodes.to.keep <-
+    mx.model.nodes[op != "null",id] %>% unique %>% sort
+  nodes.to.remove <-
+    mx.model.nodes[,id] %>% unique %>% setdiff(nodes.to.keep) %>% sort
+
+  nodes <-
+    create_nodes(
+      nodes = mx.model.nodes[id %in% nodes.to.keep, id],
+      label = mx.model.nodes[id %in% nodes.to.keep, label],
+      type = "lower",
+      style = "filled",
+      fillcolor = mx.model.nodes[id %in% nodes.to.keep, color],
+      shape = mx.model.nodes[id %in% nodes.to.keep, shape],
+      data = mx.model.nodes[id %in% nodes.to.keep, id],
+      #fixedsize = TRUE,
+      width = "1.3",
+      height = "0.8034"
+    )
+
+  mx.model.nodes[,has.connection:= sapply(inputs, function(x)
+    length(x) > 0)]
+
+  nodes.to.insert <-
+    mx.model.nodes[id %in% nodes.to.keep &
+                     has.connection == T, .(id, inputs)]
+
+  edges <- NULL
+  for (i in 1:nrow(nodes.to.insert)) {
+    current.id <- nodes.to.insert[i, id]
+    origin <-
+      nodes.to.insert[i, inputs][[1]][,1] %>% setdiff(nodes.to.remove) %>% unique
+    destination <- rep(current.id, length(origin))
+    edges.temp <- create_edges(from = origin,
+                               to = destination,
+                               relationship = "leading_to")
+    if (is.null(edges))
+      edges <- edges.temp
+    else
+      edges <- combine_edges(edges.temp, edges)
+  }
+
+  graph <-
+    create_graph(
+      nodes_df = nodes,
+      edges_df = edges,
+      directed = TRUE,
+      # node_attrs = c("fontname = Helvetica"),
+      graph_attrs = paste0("label = \"", graph.title, "\"") %>% c(paste0("fontname = ", graph.title.font.name)) %>% c(paste0("fontsize = ", graph.title.font.size)) %>% c("labelloc = t"),
+      # node_attrs = "fontname = Helvetica",
+      edge_attrs = c("color = gray20", "arrowsize = 0.8", "arrowhead = vee")
+    )
+
+  return(render_graph(graph, width = graph.width.px, height = graph.height.px))
+}
+
+globalVariables(c("color", "shape", "label", "id", ".", "op"))
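[Note: graph.viz above turns a model's JSON dump into a DiagrammeR rendering. A usage sketch — the net uses only symbols exported by the package; that mx.symbol.save writes the JSON layout fromJSON() then parses is an assumption about the serialization format:]

    library(mxnet)
    data    <- mx.symbol.Variable("data")
    fc1     <- mx.symbol.FullyConnected(data, name = "fc1", num_hidden = 128)
    act1    <- mx.symbol.Activation(fc1, name = "relu1", act_type = "relu")
    softmax <- mx.symbol.Softmax(act1, name = "sm")
    mx.symbol.save(softmax, "net.json")  # dump the computation graph as JSON
    graph.viz("net.json")                # render; returns an htmlwidget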
diff --git a/R-package/demo/basic_convnet.R b/R-package/demo/basic_convnet.R
deleted file mode 100644
index c132a6c2f3d5..000000000000
--- a/R-package/demo/basic_convnet.R
+++ /dev/null
@@ -1,51 +0,0 @@
-require(mxnet)
-
-batch.size = 100
-data = mx.symbol.Variable("data")
-conv1= mx.symbol.Convolution(data = data, name="conv1", num_filter=32, kernel=c(3,3), stride=c(2,2))
-
-bn1 = mx.symbol.BatchNorm(data = conv1, name="bn1")
-act1 = mx.symbol.Activation(data = bn1, name="relu1", act_type="relu")
-
-mp1 = mx.symbol.Pooling(data = act1, name = "mp1", kernel=c(2,2), stride=c(2,2), pool_type="max")
-
-conv2= mx.symbol.Convolution(data = mp1, name="conv2", num_filter=32, kernel=c(3,3), stride=c(2,2))
-bn2 = mx.symbol.BatchNorm(data = conv2, name="bn2")
-act2 = mx.symbol.Activation(data = bn2, name="relu2", act_type="relu")
-
-mp2 = mx.symbol.Pooling(data = act2, name = "mp2", kernel=c(2,2), stride=c(2,2), pool_type="max")
-
-
-fl = mx.symbol.Flatten(data = mp2, name="flatten")
-fc2 = mx.symbol.FullyConnected(data = fl, name="fc2", num_hidden=10)
-softmax = mx.symbol.Softmax(data = fc2, name = "sm")
-
-dtrain = mx.varg.io.MNISTIter(list(
-  image="data/train-images-idx3-ubyte",
-  label="data/train-labels-idx1-ubyte",
-  data.shape=c(1, 28, 28),
-  batch.size=batch.size,
-  shuffle=TRUE,
-  flat=FALSE,
-  silent=0,
-  seed=10))
-
-dtest = mx.varg.io.MNISTIter(list(
-  image="data/t10k-images-idx3-ubyte",
-  label="data/t10k-labels-idx1-ubyte",
-  data.shape=c(1, 28, 28),
-  batch.size=batch.size,
-  shuffle=FALSE,
-  flat=TRUE,
-  silent=0))
-
-mx.set.seed(0)
-devices = lapply(1:2, function(i) {
-  mx.cpu(i)
-})
-model <- mx.model.FeedForward.create(softmax, X=dtrain, eval.data=dtest,
-                                     ctx=devices, num.round=1,
-                                     learning.rate=0.1, momentum=0.9,
-                                     initializer=mx.init.uniform(0.07),
-                                     batch.end.callback=mx.callback.log.train.metric(100))
-
diff --git a/R-package/demo/basic_io.R b/R-package/demo/basic_io.R
deleted file mode 100644
index 97fa94791e42..000000000000
--- a/R-package/demo/basic_io.R
+++ /dev/null
@@ -1,18 +0,0 @@
-require(mxnet)
-# To run this, run python/mxnet/test_io.py to get data first
-iter = mx.varg.io.MNISTIter(list(
-  image="data/train-images-idx3-ubyte",
-  label="data/train-labels-idx1-ubyte",
-  data.shape=c(784),
-  batch.size=3,
-  shuffle=TRUE,
-  flat=TRUE,
-  silent=0,
-  seed=10))
-
-iter$reset()
-print(iter$iter.next())
-data = iter$value()
-
-print(as.array(data$label))
-print(dim(data$data))
diff --git a/R-package/demo/basic_nn.R b/R-package/demo/basic_nn.R
deleted file mode 100644
index 36e033f04009..000000000000
--- a/R-package/demo/basic_nn.R
+++ /dev/null
@@ -1,74 +0,0 @@
-require(mxnet)
-# A basic neural net training
-# To run this, run python/mxnet/test_io.py to get data first
-
-# Network configuration
-batch.size <- 100
-data <- mx.symbol.Variable("data")
-fc1 <- mx.symbol.FullyConnected(data, name="fc1", num_hidden=128)
-act1 <- mx.symbol.Activation(fc1, name="relu1", act_type="relu")
-fc2 <- mx.symbol.FullyConnected(act1, name = "fc2", num_hidden = 64)
-act2 <- mx.symbol.Activation(fc2, name="relu2", act_type="relu")
-fc3 <- mx.symbol.FullyConnected(act2, name="fc3", num_hidden=10)
-softmax <- mx.symbol.Softmax(fc3, name = "sm")
-
-dtrain = mx.io.MNISTIter(
-  image="data/train-images-idx3-ubyte",
-  label="data/train-labels-idx1-ubyte",
-  data.shape=c(784),
-  batch.size=batch.size,
-  shuffle=TRUE,
-  flat=TRUE,
-  silent=0,
-  seed=10)
-
-accuracy <- function(label, pred) {
-  ypred = max.col(as.array(pred))
-  return(sum((as.array(label) + 1) == ypred) / length(label))
-}
-mx.set.seed(0)
-# Training parameters
-ctx <- mx.cpu()
-input.shape <- c(batch.size, 784)
-symbol <- softmax
-init <- mx.init.uniform(0.07)
-opt <- mx.opt.create("sgd", learning.rate=0.05, momentum=0.9, rescale.grad=1.0/batch.size)
-
-# Training procedure
-texec <- mx.simple.bind(symbol, ctx=ctx, data=input.shape, grad.req=TRUE)
-shapes <- lapply(texec$ref.arg.arrays, dim)
-names(shapes) <- names(texec$arg.arrays)
-arg.arrays <- mx.init.create(init, shapes, ctx)
-mx.exec.update.arg.arrays(texec, arg.arrays, match.name=TRUE)
-
-updater <- mx.opt.get.updater(opt, texec$ref.arg.arrays)
-nround <- 10
-tic <- proc.time()
-
-for (iteration in 1 : nround) {
-  nbatch <- 0
-  train.acc <- 0
-  while (dtrain$iter.next()) {
-    batch <- dtrain$value()
-    label <- batch$label
-    names(batch) <- c("data", "sm_label")
-    # copy data arguments to executor
-    mx.exec.update.arg.arrays(texec, batch, match.name=TRUE)
-    # forward pass
-    mx.exec.forward(texec, is.train=TRUE)
-    # copy prediction out
-    out.pred <- mx.nd.copyto(texec$outputs[[1]], mx.cpu())
-    # backward pass
-    mx.exec.backward(texec)
-    arg.arrays <- updater(texec$arg.arrays, texec$ref.grad.arrays)
-    mx.exec.update.arg.arrays(texec, arg.arrays, skip.null=TRUE)
-    nbatch <- nbatch + 1
-    train.acc <- train.acc + accuracy(label, out.pred)
-    if (nbatch %% 100 == 0) {
-      print(paste("Train-acc=", train.acc / nbatch))
-      print(proc.time() - tic)
-    }
-  }
-  dtrain$reset()
-  print(paste("Train-acc=", train.acc / nbatch))
-}
diff --git a/R-package/demo/basic_training.R b/R-package/demo/basic_training.R
deleted file mode 100644
index 595469db6c76..000000000000
--- a/R-package/demo/basic_training.R
+++ /dev/null
@@ -1,44 +0,0 @@
-# This is an example of training using R's array
-
-require(mxnet)
-
-# Network configuration
-batch.size <- 100
-data <- mx.symbol.Variable("data")
-fc1 <- mx.symbol.FullyConnected(data, name="fc1", num_hidden=128)
-act1 <- mx.symbol.Activation(fc1, name="relu1", act_type="relu")
-fc2 <- mx.symbol.FullyConnected(act1, name = "fc2", num_hidden = 64)
-act2 <- mx.symbol.Activation(fc2, name="relu2", act_type="relu")
-fc3 <- mx.symbol.FullyConnected(act2, name="fc3", num_hidden=10)
-softmax <- mx.symbol.Softmax(fc3, name = "sm")
-
-dtrain = mx.io.MNISTIter(
-  image="data/train-images-idx3-ubyte",
-  label="data/train-labels-idx1-ubyte",
-  data.shape=c(784),
-  batch.size=batch.size,
-  flat=TRUE,
-  silent=0,
-  seed=10)
-
-dtest = mx.io.MNISTIter(
-  image="data/t10k-images-idx3-ubyte",
-  label="data/t10k-labels-idx1-ubyte",
-  data.shape=c(784),
-  batch.size=batch.size,
-  shuffle=FALSE,
-  flat=TRUE,
-  silent=0)
-# X is R's array, we load from mxnet's native iter structure, but you don't have to
-X = mx.io.extract(dtrain, "data")
-y = mx.io.extract(dtrain, "label")
-
-devices = lapply(1:2, function(i) {
-  mx.cpu(i)
-})
-# create the model
-model <- mx.model.FeedForward.create(softmax, X=X, y=y,
-                                     ctx=devices, num.round=1,
-                                     learning.rate=0.1, momentum=0.9,
-                                     initializer=mx.init.uniform(0.07),
-                                     batch.end.callback=mx.callback.log.train.metric(100))
diff --git a/R-package/man/Ops.MXNDArray.Rd b/R-package/man/Ops.MXNDArray.Rd
index ec635008bedf..7a79f0a37474 100644
--- a/R-package/man/Ops.MXNDArray.Rd
+++ b/R-package/man/Ops.MXNDArray.Rd
@@ -1,4 +1,4 @@
-% Generated by roxygen2 (4.1.1): do not edit by hand
+% Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/ndarray.R
 \name{Ops.MXNDArray}
 \alias{Ops.MXNDArray}
diff --git a/R-package/man/arguments.Rd b/R-package/man/arguments.Rd
index 6e8b5ad18b3b..07bf02b2cfa9 100644
--- a/R-package/man/arguments.Rd
+++ b/R-package/man/arguments.Rd
@@ -1,4 +1,4 @@
-% Generated by roxygen2 (4.1.1): do not edit by hand
+% Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/symbol.R
 \name{arguments}
 \alias{arguments}
diff --git a/R-package/man/as.array.MXNDArray.Rd b/R-package/man/as.array.MXNDArray.Rd
index 1960ff01d198..34e635cdf804 100644
--- a/R-package/man/as.array.MXNDArray.Rd
+++ b/R-package/man/as.array.MXNDArray.Rd
@@ -1,4 +1,4 @@
-% Generated by roxygen2 (4.1.1): do not edit by hand
+% Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/ndarray.R
 \name{as.array.MXNDArray}
 \alias{as.array.MXNDArray}
diff --git a/R-package/man/as.matrix.MXNDArray.Rd b/R-package/man/as.matrix.MXNDArray.Rd
index 2173cf01489d..68f9afdd230b 100644
--- a/R-package/man/as.matrix.MXNDArray.Rd
+++ b/R-package/man/as.matrix.MXNDArray.Rd
@@ -1,4 +1,4 @@
-% Generated by roxygen2 (4.1.1): do not edit by hand
+% Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/ndarray.R
 \name{as.matrix.MXNDArray}
 \alias{as.matrix.MXNDArray}
diff --git a/R-package/man/ctx.Rd b/R-package/man/ctx.Rd
index a0b5274cb4c8..ed370a130a80 100644
--- a/R-package/man/ctx.Rd
+++ b/R-package/man/ctx.Rd
@@ -1,4 +1,4 @@
-% Generated by roxygen2 (4.1.1): do not edit by hand
+% Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/ndarray.R
 \name{ctx}
 \alias{ctx}
diff --git a/R-package/man/dim.MXNDArray.Rd b/R-package/man/dim.MXNDArray.Rd
index 139268e713b8..1c68efc8b8ea 100644
--- a/R-package/man/dim.MXNDArray.Rd
+++ b/R-package/man/dim.MXNDArray.Rd
@@ -1,4 +1,4 @@
-% Generated by roxygen2 (4.1.1): do not edit by hand
+% Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/ndarray.R
 \name{dim.MXNDArray}
 \alias{dim.MXNDArray}
diff --git a/R-package/man/graph.viz.Rd b/R-package/man/graph.viz.Rd
new file mode 100644
index 000000000000..c9fd94a5e44f
--- /dev/null
+++ b/R-package/man/graph.viz.Rd
@@ -0,0 +1,30 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/viz.graph.R
+\name{graph.viz}
+\alias{graph.viz}
+\title{Convert symbol to dot object for visualization purposes.}
+\usage{
+graph.viz(model, graph.title = "Computation graph",
+  graph.title.font.name = "Helvetica", graph.title.font.size = 30,
+  graph.width.px = 500, graph.height.px = 500)
+}
+\arguments{
+\item{model}{a \code{string} representing the path to a file containing the \code{JSON} of a model dump, or the actual model dump.}
+
+\item{graph.title}{a \code{string} displayed on top of the viz.}
+
+\item{graph.title.font.name}{a \code{string} representing the font to use for the title.}
+
+\item{graph.title.font.size}{a \code{numeric} representing the size of the font to use for the title.}
+
+\item{graph.width.px}{a \code{numeric} representing the size (width) of the graph, in pixels.}
+
+\item{graph.height.px}{a \code{numeric} representing the size (height) of the graph, in pixels.}
+}
+\value{
+a graph object ready to be displayed with the \code{print} function.
+}
+\description{
+Convert symbol to dot object for visualization purposes.
+}
+
diff --git a/R-package/man/is.mx.context.Rd b/R-package/man/is.mx.context.Rd
index 6a2874208075..a05d2f72e644 100644
--- a/R-package/man/is.mx.context.Rd
+++ b/R-package/man/is.mx.context.Rd
@@ -1,4 +1,4 @@
-% Generated by roxygen2 (4.1.1): do not edit by hand
+% Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/context.R
 \name{is.mx.context}
 \alias{is.mx.context}
diff --git a/R-package/man/is.mx.dataiter.Rd b/R-package/man/is.mx.dataiter.Rd
index 47cc5b0bc37e..e010af6b1984 100644
--- a/R-package/man/is.mx.dataiter.Rd
+++ b/R-package/man/is.mx.dataiter.Rd
@@ -1,4 +1,4 @@
-% Generated by roxygen2 (4.1.1): do not edit by hand
+% Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/io.R
 \name{is.mx.dataiter}
 \alias{is.mx.dataiter}
diff --git a/R-package/man/is.mx.ndarray.Rd b/R-package/man/is.mx.ndarray.Rd
index e3e9d5ef9a81..80994cddadc1 100644
--- a/R-package/man/is.mx.ndarray.Rd
+++ b/R-package/man/is.mx.ndarray.Rd
@@ -1,4 +1,4 @@
-% Generated by roxygen2 (4.1.1): do not edit by hand
+% Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/ndarray.R
 \name{is.mx.ndarray}
 \alias{is.mx.ndarray}
@@ -17,5 +17,6 @@ mat = mx.nd.array(1:10)
 is.mx.ndarray(mat)
 mat2 = 1:10
 is.mx.ndarray(mat2)
+
 }
diff --git a/R-package/man/is.mx.symbol.Rd b/R-package/man/is.mx.symbol.Rd
index adee39247584..54546c9acca6 100644
--- a/R-package/man/is.mx.symbol.Rd
+++ b/R-package/man/is.mx.symbol.Rd
@@ -1,4 +1,4 @@
-% Generated by roxygen2 (4.1.1): do not edit by hand
+% Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/symbol.R
 \name{is.mx.symbol}
 \alias{is.mx.symbol}
diff --git a/R-package/man/length.MXNDArray.Rd b/R-package/man/length.MXNDArray.Rd
index 79f6cc156481..059fab3d706c 100644
--- a/R-package/man/length.MXNDArray.Rd
+++ b/R-package/man/length.MXNDArray.Rd
@@ -1,4 +1,4 @@
-% Generated by roxygen2 (4.1.1): do not edit by hand
+% Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/ndarray.R
 \name{length.MXNDArray}
 \alias{length.MXNDArray}
diff --git a/R-package/man/mx.apply.Rd b/R-package/man/mx.apply.Rd
index 44707a4c87fe..3bfb9ca6945e 100644
--- a/R-package/man/mx.apply.Rd
+++ b/R-package/man/mx.apply.Rd
@@ -1,4 +1,4 @@
-% Generated by roxygen2 (4.1.1): do not edit by hand
+% Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/symbol.R
 \name{mx.apply}
 \alias{mx.apply}
diff --git a/R-package/man/mx.callback.log.train.metric.Rd b/R-package/man/mx.callback.log.train.metric.Rd
index fb5502d8e4db..bdb2feaed8c1 100644
--- a/R-package/man/mx.callback.log.train.metric.Rd
+++ b/R-package/man/mx.callback.log.train.metric.Rd
@@ -1,4 +1,4 @@
-% Generated by roxygen2 (4.1.1): do not edit by hand
+% Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/callback.R
 \name{mx.callback.log.train.metric}
 \alias{mx.callback.log.train.metric}
diff --git a/R-package/man/mx.callback.save.checkpoint.Rd b/R-package/man/mx.callback.save.checkpoint.Rd
index 92814aa3b041..defcce8e3d3f 100644
--- a/R-package/man/mx.callback.save.checkpoint.Rd
+++ b/R-package/man/mx.callback.save.checkpoint.Rd
@@ -1,4 +1,4 @@
-% Generated by roxygen2 (4.1.1): do not edit by hand
+% Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/callback.R
 \name{mx.callback.save.checkpoint}
 \alias{mx.callback.save.checkpoint}
diff --git a/R-package/man/mx.cpu.Rd b/R-package/man/mx.cpu.Rd
index 4e3dcb6282c5..2c008239ddad 100644
--- a/R-package/man/mx.cpu.Rd
+++ b/R-package/man/mx.cpu.Rd
@@ -1,4 +1,4 @@
-% Generated by roxygen2 (4.1.1): do not edit by hand
+% Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/context.R
 \name{mx.cpu}
 \alias{mx.cpu}
diff --git a/R-package/man/mx.ctx.default.Rd b/R-package/man/mx.ctx.default.Rd
index 6f599ba4453c..95d014966c25 100644
--- a/R-package/man/mx.ctx.default.Rd
+++ b/R-package/man/mx.ctx.default.Rd
@@ -1,4 +1,4 @@
-% Generated by roxygen2 (4.1.1): do not edit by hand
+% Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/context.R
 \name{mx.ctx.default}
 \alias{mx.ctx.default}
@@ -7,7 +7,7 @@
 mx.ctx.default(new = NULL)
 }
 \arguments{
-\item{new,}{optional takes \code{mx.cpu()} or \code{mx.gpu(id)}, new default ctx.}
+\item{new, }{optional takes \code{mx.cpu()} or \code{mx.gpu(id)}, new default ctx.}
 }
 \value{
 The default context.
diff --git a/R-package/man/mx.exec.backward.Rd b/R-package/man/mx.exec.backward.Rd
index f4c922533160..0d62b1ff755e 100644
--- a/R-package/man/mx.exec.backward.Rd
+++ b/R-package/man/mx.exec.backward.Rd
@@ -1,4 +1,4 @@
-% Generated by roxygen2 (4.1.1): do not edit by hand
+% Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/executor.R
 \name{mx.exec.backward}
 \alias{mx.exec.backward}
diff --git a/R-package/man/mx.exec.forward.Rd b/R-package/man/mx.exec.forward.Rd
index 9af80853dcc7..83529ef0ec9f 100644
--- a/R-package/man/mx.exec.forward.Rd
+++ b/R-package/man/mx.exec.forward.Rd
@@ -1,4 +1,4 @@
-% Generated by roxygen2 (4.1.1): do not edit by hand
+% Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/executor.R
 \name{mx.exec.forward}
 \alias{mx.exec.forward}
diff --git a/R-package/man/mx.exec.update.arg.arrays.Rd b/R-package/man/mx.exec.update.arg.arrays.Rd
index b9bda6faffd9..64844961f61d 100644
--- a/R-package/man/mx.exec.update.arg.arrays.Rd
+++ b/R-package/man/mx.exec.update.arg.arrays.Rd
@@ -1,4 +1,4 @@
-% Generated by roxygen2 (4.1.1): do not edit by hand
+% Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/executor.R
 \name{mx.exec.update.arg.arrays}
 \alias{mx.exec.update.arg.arrays}
diff --git a/R-package/man/mx.exec.update.aux.arrays.Rd b/R-package/man/mx.exec.update.aux.arrays.Rd
index e66d3365e541..c7511904fdc1 100644
--- a/R-package/man/mx.exec.update.aux.arrays.Rd
+++ b/R-package/man/mx.exec.update.aux.arrays.Rd
@@ -1,4 +1,4 @@
-% Generated by roxygen2 (4.1.1): do not edit by hand
+% Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/executor.R
 \name{mx.exec.update.aux.arrays}
 \alias{mx.exec.update.aux.arrays}
diff --git a/R-package/man/mx.gpu.Rd b/R-package/man/mx.gpu.Rd
index a45710c7d278..5546073c2f07 100644
--- a/R-package/man/mx.gpu.Rd
+++ b/R-package/man/mx.gpu.Rd
@@ -1,4 +1,4 @@
-% Generated by roxygen2 (4.1.1): do not edit by hand
+% Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/context.R
 \name{mx.gpu}
 \alias{mx.gpu}
diff --git a/R-package/man/mx.init.create.Rd b/R-package/man/mx.init.create.Rd
index fd76a0d66a58..847128320f75 100644
--- a/R-package/man/mx.init.create.Rd
+++ b/R-package/man/mx.init.create.Rd
@@ -1,4 +1,4 @@
-% Generated by roxygen2 (4.1.1): do not edit by hand
+% Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/initializer.R
 \name{mx.init.create}
 \alias{mx.init.create}
diff --git a/R-package/man/mx.init.internal.default.Rd b/R-package/man/mx.init.internal.default.Rd
index 195e70106ad2..0b167f040752 100644
--- a/R-package/man/mx.init.internal.default.Rd
+++ b/R-package/man/mx.init.internal.default.Rd
@@ -1,4 +1,4 @@
-% Generated by roxygen2 (4.1.1): do not edit by hand
+% Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/initializer.R
 \name{mx.init.internal.default}
 \alias{mx.init.internal.default}
diff --git a/R-package/man/mx.init.normal.Rd b/R-package/man/mx.init.normal.Rd
index aea30fd79fd2..2b8fe05ffe9b 100644
--- a/R-package/man/mx.init.normal.Rd
+++ b/R-package/man/mx.init.normal.Rd
@@ -1,4 +1,4 @@
-% Generated by roxygen2 (4.1.1): do not edit by hand
+% Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/initializer.R
 \name{mx.init.normal}
 \alias{mx.init.normal}
diff --git a/R-package/man/mx.init.uniform.Rd b/R-package/man/mx.init.uniform.Rd
index 0b8afd52921b..3e0d6a93fe9e 100644
--- a/R-package/man/mx.init.uniform.Rd
+++ b/R-package/man/mx.init.uniform.Rd
@@ -1,4 +1,4 @@
-% Generated by roxygen2 (4.1.1): do not edit by hand
+% Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/initializer.R
 \name{mx.init.uniform}
 \alias{mx.init.uniform}
diff --git a/R-package/man/mx.io.ImageRecordIter.Rd b/R-package/man/mx.io.ImageRecordIter.Rd
index a1a4c6633f58..4e13ca79790a 100644
--- a/R-package/man/mx.io.ImageRecordIter.Rd
+++ b/R-package/man/mx.io.ImageRecordIter.Rd
@@ -1,4 +1,4 @@
-% Generated by roxygen2 (4.1.1): do not edit by hand
+% Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/mxnet_generated.R
 \name{mx.io.ImageRecordIter}
 \alias{mx.io.ImageRecordIter}
diff --git a/R-package/man/mx.io.MNISTIter.Rd b/R-package/man/mx.io.MNISTIter.Rd
index 798f71797991..2e239022319e 100644
--- a/R-package/man/mx.io.MNISTIter.Rd
+++ b/R-package/man/mx.io.MNISTIter.Rd
@@ -1,4 +1,4 @@
-% Generated by roxygen2 (4.1.1): do not edit by hand
+% Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/mxnet_generated.R
 \name{mx.io.MNISTIter}
 \alias{mx.io.MNISTIter}
diff --git a/R-package/man/mx.io.arrayiter.Rd b/R-package/man/mx.io.arrayiter.Rd
index cb0db7d4a7fa..c6651dcb12fc 100644
--- a/R-package/man/mx.io.arrayiter.Rd
+++ b/R-package/man/mx.io.arrayiter.Rd
@@ -1,4 +1,4 @@
-% Generated by roxygen2 (4.1.1): do not edit by hand
+% Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/io.R
 \name{mx.io.arrayiter}
 \alias{mx.io.arrayiter}
diff --git a/R-package/man/mx.io.extract.Rd b/R-package/man/mx.io.extract.Rd
index 2897d87465cf..12a778b6609c 100644
--- a/R-package/man/mx.io.extract.Rd
+++ b/R-package/man/mx.io.extract.Rd
@@ -1,4 +1,4 @@
-% Generated by roxygen2 (4.1.1): do not edit by hand
+% Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/io.R
 \name{mx.io.extract}
 \alias{mx.io.extract}
diff --git a/R-package/man/mx.kv.create.Rd b/R-package/man/mx.kv.create.Rd
index 4ad4d4c64b0a..2a602daea55e 100644
--- a/R-package/man/mx.kv.create.Rd
+++ b/R-package/man/mx.kv.create.Rd
@@ -1,4 +1,4 @@
-% Generated by roxygen2 (4.1.1): do not edit by hand
+% Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/kvstore.R
 \name{mx.kv.create}
 \alias{mx.kv.create}
diff --git a/R-package/man/mx.metric.accuracy.Rd b/R-package/man/mx.metric.accuracy.Rd
index 174d77fed8f9..afbca8357ae7 100644
--- a/R-package/man/mx.metric.accuracy.Rd
+++ b/R-package/man/mx.metric.accuracy.Rd
@@ -1,15 +1,10 @@
-% Generated by roxygen2 (4.1.1): do not edit by hand
+% Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/metric.R
 \docType{data}
 \name{mx.metric.accuracy}
 \alias{mx.metric.accuracy}
 \title{Accuracy metric for classification}
-\format{\preformatted{List of 3
- $ init  :function ()
- $ update:function (label, pred, state)
- $ get   :function (state)
- - attr(*, "class")= chr "mx.metric"
-}}
+\format{An object of class \code{mx.metric} of length 3.}
 \usage{
 mx.metric.accuracy
 }
diff --git a/R-package/man/mx.metric.custom.Rd b/R-package/man/mx.metric.custom.Rd
index 5671c931ca2a..eb745decef34 100644
--- a/R-package/man/mx.metric.custom.Rd
+++ b/R-package/man/mx.metric.custom.Rd
@@ -1,4 +1,4 @@
-% Generated by roxygen2 (4.1.1): do not edit by hand
+% Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/metric.R
 \name{mx.metric.custom}
 \alias{mx.metric.custom}
diff --git a/R-package/man/mx.metric.mae.Rd b/R-package/man/mx.metric.mae.Rd
index a98df21f7d7f..6bade0e5a8ee 100644
--- a/R-package/man/mx.metric.mae.Rd
+++ b/R-package/man/mx.metric.mae.Rd
@@ -1,15 +1,10 @@
-% Generated by roxygen2 (4.1.1): do not edit by hand
+% Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/metric.R
 \docType{data}
 \name{mx.metric.mae}
 \alias{mx.metric.mae}
 \title{MAE (Mean Absolute Error) metric for regression}
-\format{\preformatted{List of 3
- $ init  :function ()
- $ update:function (label, pred, state)
- $ get   :function (state)
- - attr(*, "class")= chr "mx.metric"
-}}
+\format{An object of class \code{mx.metric} of length 3.}
 \usage{
 mx.metric.mae
 }
diff --git a/R-package/man/mx.metric.rmse.Rd b/R-package/man/mx.metric.rmse.Rd
index 76b4696a910b..636dc37a8d0c 100644
--- a/R-package/man/mx.metric.rmse.Rd
+++ b/R-package/man/mx.metric.rmse.Rd
@@ -1,15 +1,10 @@
-% Generated by roxygen2 (4.1.1): do not edit by hand
+% Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/metric.R
 \docType{data}
 \name{mx.metric.rmse}
 \alias{mx.metric.rmse}
 \title{RMSE (Root Mean Squared Error) metric for regression}
-\format{\preformatted{List of 3
- $ init  :function ()
- $ update:function (label, pred, state)
- $ get   :function (state)
- - attr(*, "class")= chr "mx.metric"
-}}
+\format{An object of class \code{mx.metric} of length 3.}
 \usage{
 mx.metric.rmse
 }
diff --git a/R-package/man/mx.metric.rmsle.Rd b/R-package/man/mx.metric.rmsle.Rd
index 3e2737fe07b7..ffab1b3cb227 100644
--- a/R-package/man/mx.metric.rmsle.Rd
+++ b/R-package/man/mx.metric.rmsle.Rd
@@ -1,15 +1,10 @@
-% Generated by roxygen2 (4.1.1): do not edit by hand
+% Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/metric.R
 \docType{data}
 \name{mx.metric.rmsle}
 \alias{mx.metric.rmsle}
 \title{RMSLE (Root Mean Squared Logarithmic Error) metric for regression}
-\format{\preformatted{List of 3
- $ init  :function ()
- $ update:function (label, pred, state)
- $ get   :function (state)
- - attr(*, "class")= chr "mx.metric"
-}}
+\format{An object of class \code{mx.metric} of length 3.}
 \usage{
 mx.metric.rmsle
 }
diff --git a/R-package/man/mx.model.FeedForward.create.Rd b/R-package/man/mx.model.FeedForward.create.Rd
index e64ad24dae44..a3a80338cfa3 100644
--- a/R-package/man/mx.model.FeedForward.create.Rd
+++ b/R-package/man/mx.model.FeedForward.create.Rd
@@ -1,4 +1,4 @@
-% Generated by roxygen2 (4.1.1): do not edit by hand
+% Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/model.R
 \name{mx.model.FeedForward.create}
 \alias{mx.model.FeedForward.create}
@@ -28,7 +28,7 @@ The number of iterations over training data to train the model.}
 \item{optimizer}{string, default="sgd"
 The optimization method.}
 
-\item{initializer,}{initializer object. default=mx.init.uniform(0.01)
+\item{initializer, }{initializer object. default=mx.init.uniform(0.01)
 The initialization scheme for parameters.}
 
 \item{eval.data}{mx.io.DataIter or list(data=R.array, label=R.array), optional
diff --git a/R-package/man/mx.model.load.Rd b/R-package/man/mx.model.load.Rd
index c5b8781c80e7..72a6b33514c2 100644
--- a/R-package/man/mx.model.load.Rd
+++ b/R-package/man/mx.model.load.Rd
@@ -1,4 +1,4 @@
-% Generated by roxygen2 (4.1.1): do not edit by hand
+% Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/model.R
 \name{mx.model.load}
 \alias{mx.model.load}
diff --git a/R-package/man/mx.model.save.Rd b/R-package/man/mx.model.save.Rd
index 0cbc724b2eb1..dbf13f653837 100644
--- a/R-package/man/mx.model.save.Rd
+++ b/R-package/man/mx.model.save.Rd
@@ -1,4 +1,4 @@
-% Generated by roxygen2 (4.1.1): do not edit by hand
+% Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/model.R
 \name{mx.model.save}
 \alias{mx.model.save}
diff --git a/R-package/man/mx.nd.array.Rd b/R-package/man/mx.nd.array.Rd
index 061ba0912094..95cefb4e93fd 100644
--- a/R-package/man/mx.nd.array.Rd
+++ b/R-package/man/mx.nd.array.Rd
@@ -1,4 +1,4 @@
-% Generated by roxygen2 (4.1.1): do not edit by hand
+% Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/ndarray.R
 \name{mx.nd.array}
 \alias{mx.nd.array}
@@ -23,5 +23,6 @@ Create a new \code{mx.ndarray} that copies the content from src on ctx.
 mat = mx.nd.array(x)
 mat = 1 - mat + (2 * mat)/(mat + 0.5)
 as.array(mat)
+
 }
diff --git a/R-package/man/mx.nd.choose.element.Rd b/R-package/man/mx.nd.choose.element.0index.Rd
similarity index 52%
rename from R-package/man/mx.nd.choose.element.Rd
rename to R-package/man/mx.nd.choose.element.0index.Rd
index 19db0393ec76..bae9776cdae1 100644
--- a/R-package/man/mx.nd.choose.element.Rd
+++ b/R-package/man/mx.nd.choose.element.0index.Rd
@@ -1,8 +1,8 @@
-% Generated by roxygen2 (4.1.1): do not edit by hand
+% Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/mxnet_generated.R
-\name{mx.nd.choose.element}
-\alias{mx.nd.choose.element}
-\title{Choose one element from each line(row for python, column for R/Julia) in lhs according to index indicated by rhs}
+\name{mx.nd.choose.element.0index}
+\alias{mx.nd.choose.element.0index}
+\title{Choose one element from each line (row for Python, column for R/Julia) in lhs according to the index indicated by rhs. This function assumes rhs uses a 0-based index.}
 \arguments{
 \item{lhs}{NDArray
 Left operand to the function.}
@@ -14,6 +14,6 @@ Right operand to the function.}
 out The result mx.ndarray
 }
 \description{
-Choose one element from each line(row for python, column for R/Julia) in lhs according to index indicated by rhs
+Choose one element from each line (row for Python, column for R/Julia) in lhs according to the index indicated by rhs. This function assumes rhs uses a 0-based index.
 }
diff --git a/R-package/man/mx.nd.clip.Rd b/R-package/man/mx.nd.clip.Rd
index 5256bd98f54e..8185f36fdcfa 100644
--- a/R-package/man/mx.nd.clip.Rd
+++ b/R-package/man/mx.nd.clip.Rd
@@ -1,4 +1,4 @@
-% Generated by roxygen2 (4.1.1): do not edit by hand
+% Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/mxnet_generated.R
 \name{mx.nd.clip}
 \alias{mx.nd.clip}
diff --git a/R-package/man/mx.nd.copyto.Rd b/R-package/man/mx.nd.copyto.Rd
index 2c8d721572ab..6c3e1c0bd73e 100644
--- a/R-package/man/mx.nd.copyto.Rd
+++ b/R-package/man/mx.nd.copyto.Rd
@@ -1,4 +1,4 @@
-% Generated by roxygen2 (4.1.1): do not edit by hand
+% Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/ndarray.R
 \name{mx.nd.copyto}
 \alias{mx.nd.copyto}
diff --git a/R-package/man/mx.nd.dot.Rd b/R-package/man/mx.nd.dot.Rd
index 40c9d8e0550c..2576d70be2c8 100644
--- a/R-package/man/mx.nd.dot.Rd
+++ b/R-package/man/mx.nd.dot.Rd
@@ -1,4 +1,4 @@
-% Generated by roxygen2 (4.1.1): do not edit by hand
+% Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/mxnet_generated.R
 \name{mx.nd.dot}
 \alias{mx.nd.dot}
diff --git a/R-package/man/mx.nd.exp.Rd b/R-package/man/mx.nd.exp.Rd
new file mode 100644
index 000000000000..b876d8235f51
--- /dev/null
+++ b/R-package/man/mx.nd.exp.Rd
@@ -0,0 +1,16 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/mxnet_generated.R
+\name{mx.nd.exp}
+\alias{mx.nd.exp}
+\title{Take exp of the src}
+\arguments{
+\item{src}{NDArray
+Source input to the function}
+}
+\value{
+out The result mx.ndarray
+}
+\description{
+Take exp of the src
+}
+
diff --git a/R-package/man/mx.nd.load.Rd b/R-package/man/mx.nd.load.Rd
index 850e943eebae..0deb2e0af36d 100644
--- a/R-package/man/mx.nd.load.Rd
+++ b/R-package/man/mx.nd.load.Rd
@@ -1,4 +1,4 @@
-% Generated by roxygen2 (4.1.1): do not edit by hand
+% Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/ndarray.R
 \name{mx.nd.load}
 \alias{mx.nd.load}
@@ -18,5 +18,6 @@ mx.nd.save(mat, 'temp.mat')
 mat2 = mx.nd.load('temp.mat')
 as.array(mat)
 as.array(mat2)
+
 }
diff --git a/R-package/man/mx.nd.log.Rd b/R-package/man/mx.nd.log.Rd
new file mode 100644
index 000000000000..f5825e55d6b2
--- /dev/null
+++ b/R-package/man/mx.nd.log.Rd
@@ -0,0 +1,16 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/mxnet_generated.R
+\name{mx.nd.log}
+\alias{mx.nd.log}
+\title{Take log of the src}
+\arguments{
+\item{src}{NDArray
+Source input to the function}
+}
+\value{
+out The result mx.ndarray
+}
+\description{
+Take log of the src
+}
+
diff --git a/R-package/man/mx.nd.norm.Rd b/R-package/man/mx.nd.norm.Rd
new file mode 100644
index 000000000000..7e2feb7a889f
--- /dev/null
+++ b/R-package/man/mx.nd.norm.Rd
@@ -0,0 +1,16 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/mxnet_generated.R
+\name{mx.nd.norm}
+\alias{mx.nd.norm}
+\title{Take the L2 norm of the src. The result will be an ndarray of shape (1,) on the same device.}
+\arguments{
+\item{src}{NDArray
+Source input to the function}
+}
+\value{
+out The result mx.ndarray
+}
+\description{
+Take the L2 norm of the src. The result will be an ndarray of shape (1,) on the same device.
+}
+
diff --git a/R-package/man/mx.nd.ones.Rd b/R-package/man/mx.nd.ones.Rd
index 2f7bc8acb290..c191c4c26578 100644
--- a/R-package/man/mx.nd.ones.Rd
+++ b/R-package/man/mx.nd.ones.Rd
@@ -1,4 +1,4 @@
-% Generated by roxygen2 (4.1.1): do not edit by hand
+% Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/ndarray.R
 \name{mx.nd.ones}
 \alias{mx.nd.ones}
@@ -21,5 +21,6 @@ mat2 = mx.nd.ones(c(5,5))
 as.array(mat)
 mat3 = mx.nd.ones(c(3,3,3))
 as.array(mat3)
+
 }
diff --git a/R-package/man/mx.nd.save.Rd b/R-package/man/mx.nd.save.Rd
index bca981d47c96..7f20599183a4 100644
--- a/R-package/man/mx.nd.save.Rd
+++ b/R-package/man/mx.nd.save.Rd
@@ -1,4 +1,4 @@
-% Generated by roxygen2 (4.1.1): do not edit by hand
+% Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/ndarray.R
 \name{mx.nd.save}
 \alias{mx.nd.save}
@@ -20,5 +20,6 @@ mx.nd.save(mat, 'temp.mat')
 mat2 = mx.nd.load('temp.mat')
 as.array(mat)
 as.array(mat2)
+
 }
diff --git a/R-package/man/mx.nd.sqrt.Rd b/R-package/man/mx.nd.sqrt.Rd
index af96445d89f7..2a7e3a0fe5e5 100644
--- a/R-package/man/mx.nd.sqrt.Rd
+++ b/R-package/man/mx.nd.sqrt.Rd
@@ -1,16 +1,16 @@
-% Generated by roxygen2 (4.1.1): do not edit by hand
+% Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/mxnet_generated.R
 \name{mx.nd.sqrt}
 \alias{mx.nd.sqrt}
-\title{Take square root of the src}
+\title{Take sqrt of the src}
 \arguments{
 \item{src}{NDArray
-Source input to the function.}
+Source input to the function}
 }
 \value{
 out The result mx.ndarray
 }
 \description{
-Take square root of the src
+Take sqrt of the src
 }
diff --git a/R-package/man/mx.nd.square.Rd b/R-package/man/mx.nd.square.Rd
index 063b1359ee6e..ea5025a8b7a0 100644
--- a/R-package/man/mx.nd.square.Rd
+++ b/R-package/man/mx.nd.square.Rd
@@ -1,11 +1,11 @@
-% Generated by roxygen2 (4.1.1): do not edit by hand
+% Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/mxnet_generated.R
 \name{mx.nd.square}
 \alias{mx.nd.square}
 \title{Take square of the src}
 \arguments{
 \item{src}{NDArray
-Source input to the function.}
+Source input to the function}
 }
 \value{
 out The result mx.ndarray
diff --git a/R-package/man/mx.nd.zeros.Rd b/R-package/man/mx.nd.zeros.Rd
index 6d522abbec08..3736bbbe90da 100644
--- a/R-package/man/mx.nd.zeros.Rd
+++ b/R-package/man/mx.nd.zeros.Rd
@@ -1,4 +1,4 @@
-% Generated by roxygen2 (4.1.1): do not edit by hand
+% Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/ndarray.R
 \name{mx.nd.zeros}
 \alias{mx.nd.zeros}
@@ -21,5 +21,6 @@ mat2 = mx.nd.zeros(c(5,5))
 as.array(mat)
 mat3 = mx.nd.zeros(c(3,3,3))
 as.array(mat3)
+
 }
diff --git a/R-package/man/mx.opt.create.Rd b/R-package/man/mx.opt.create.Rd
index 813baf90454d..b1c0c07b97ac 100644
--- a/R-package/man/mx.opt.create.Rd
+++ b/R-package/man/mx.opt.create.Rd
@@ -1,4 +1,4 @@
-% Generated by roxygen2 (4.1.1): do not edit by hand
+% Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/optimizer.R
 \name{mx.opt.create}
 \alias{mx.opt.create}
diff --git a/R-package/man/mx.opt.get.updater.Rd b/R-package/man/mx.opt.get.updater.Rd
index db63d7cb6637..b0cb07b649c9 100644
--- a/R-package/man/mx.opt.get.updater.Rd
+++ b/R-package/man/mx.opt.get.updater.Rd
@@ -1,4 +1,4 @@
-% Generated by roxygen2 (4.1.1): do not edit by hand
+% Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/optimizer.R
 \name{mx.opt.get.updater}
 \alias{mx.opt.get.updater}
diff --git a/R-package/man/mx.opt.sgd.Rd b/R-package/man/mx.opt.sgd.Rd
index f81e3ad81cfe..6493c4c37176 100644
--- a/R-package/man/mx.opt.sgd.Rd
+++ b/R-package/man/mx.opt.sgd.Rd
@@ -1,4 +1,4 @@
-% Generated by roxygen2 (4.1.1): do not edit by hand
+% Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/optimizer.R
 \name{mx.opt.sgd}
 \alias{mx.opt.sgd}
diff --git a/R-package/man/mx.rnorm.Rd b/R-package/man/mx.rnorm.Rd
index 43a63c000394..8c87a325dd97 100644
--- a/R-package/man/mx.rnorm.Rd
+++ b/R-package/man/mx.rnorm.Rd
@@ -1,4 +1,4 @@
-% Generated by roxygen2 (4.1.1): do not edit by hand
+% Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/random.R
 \name{mx.rnorm}
 \alias{mx.rnorm}
@@ -13,17 +13,19 @@ mx.rnorm(shape, mean = 0, sd = 1, ctx = NULL)
 
 \item{sd}{numeric, The standard deviations.}
 
-\item{ctx,}{optional The context device of the array. mx.ctx.default() will be used in default.}
+\item{ctx, }{optional The context device of the array. mx.ctx.default() will be used in default.}
 }
 \description{
 Generate normal distribution with mean and sd.
 }
 \examples{
+
 mx.set.seed(0)
 as.array(mx.runif(2))
 # 0.5488135 0.5928446
 mx.set.seed(0)
 as.array(mx.rnorm(2))
 # 2.212206 1.163079
+
 }
diff --git a/R-package/man/mx.runif.Rd b/R-package/man/mx.runif.Rd
index bf89e502c488..565b96ce3043 100644
--- a/R-package/man/mx.runif.Rd
+++ b/R-package/man/mx.runif.Rd
@@ -1,4 +1,4 @@
-% Generated by roxygen2 (4.1.1): do not edit by hand
+% Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/random.R
 \name{mx.runif}
 \alias{mx.runif}
@@ -13,17 +13,19 @@ mx.runif(shape, min = 0, max = 1, ctx = NULL)
 
 \item{max}{numeric, The upper bound of distribution.}
 
-\item{ctx,}{optional The context device of the array. mx.ctx.default() will be used in default.}
+\item{ctx, }{optional The context device of the array. mx.ctx.default() will be used in default.}
 }
 \description{
 Generate uniform distribution in [low, high) with specified shape.
 }
 \examples{
+
 mx.set.seed(0)
 as.array(mx.runif(2))
 # 0.5488135 0.5928446
 mx.set.seed(0)
 as.array(mx.rnorm(2))
 # 2.212206 1.163079
+
 }
diff --git a/R-package/man/mx.set.seed.Rd b/R-package/man/mx.set.seed.Rd
index 4139f4ab9e08..4ab6e67ea7ef 100644
--- a/R-package/man/mx.set.seed.Rd
+++ b/R-package/man/mx.set.seed.Rd
@@ -1,4 +1,4 @@
-% Generated by roxygen2 (4.1.1): do not edit by hand
+% Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/random.R
 \name{mx.set.seed}
 \alias{mx.set.seed}
@@ -23,11 +23,13 @@ random number generations.
 It can also be quite costly to seed these PRNGs.
 So we introduced \code{mx.set.seed} for mxnet specific device random numbers.
 }
 \examples{
+
 mx.set.seed(0)
 as.array(mx.runif(2))
 # 0.5488135 0.5928446
 mx.set.seed(0)
 as.array(mx.rnorm(2))
 # 2.212206 1.163079
+
 }
diff --git a/R-package/man/mx.simple.bind.Rd b/R-package/man/mx.simple.bind.Rd
index 4745a200477a..72af44cca995 100644
--- a/R-package/man/mx.simple.bind.Rd
+++ b/R-package/man/mx.simple.bind.Rd
@@ -1,4 +1,4 @@
-% Generated by roxygen2 (4.1.1): do not edit by hand
+% Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/executor.R
 \name{mx.simple.bind}
 \alias{mx.simple.bind}
diff --git a/R-package/man/mx.symbol.Activation.Rd b/R-package/man/mx.symbol.Activation.Rd
index b3c2f38780ab..3fd9892faedc 100644
--- a/R-package/man/mx.symbol.Activation.Rd
+++ b/R-package/man/mx.symbol.Activation.Rd
@@ -1,4 +1,4 @@
-% Generated by roxygen2 (4.1.1): do not edit by hand
+% Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/mxnet_generated.R
 \name{mx.symbol.Activation}
 \alias{mx.symbol.Activation}
diff --git a/R-package/man/mx.symbol.BatchNorm.Rd b/R-package/man/mx.symbol.BatchNorm.Rd
index 838e89ce0db2..2f7a984d5d97 100644
--- a/R-package/man/mx.symbol.BatchNorm.Rd
+++ b/R-package/man/mx.symbol.BatchNorm.Rd
@@ -1,4 +1,4 @@
-% Generated by roxygen2 (4.1.1): do not edit by hand
+% Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/mxnet_generated.R
 \name{mx.symbol.BatchNorm}
 \alias{mx.symbol.BatchNorm}
diff --git a/R-package/man/mx.symbol.BlockGrad.Rd b/R-package/man/mx.symbol.BlockGrad.Rd
new file mode 100644
index 000000000000..6a7e6037e7a6
--- /dev/null
+++ b/R-package/man/mx.symbol.BlockGrad.Rd
@@ -0,0 +1,22 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/mxnet_generated.R
+\name{mx.symbol.BlockGrad}
+\alias{mx.symbol.BlockGrad}
+\title{Get output from a symbol and pass 0 gradient back}
+\usage{
+mx.symbol.BlockGrad(...)
+}
+\arguments{
+\item{data}{Symbol
+Input data.}
+
+\item{name}{string, optional
+Name of the resulting symbol.}
+}
+\value{
+out The result mx.symbol
+}
+\description{
+Get output from a symbol and pass 0 gradient back
+}
+
diff --git a/R-package/man/mx.symbol.Concat.Rd b/R-package/man/mx.symbol.Concat.Rd
index 8254d0fadabe..e290ede87c9a 100644
--- a/R-package/man/mx.symbol.Concat.Rd
+++ b/R-package/man/mx.symbol.Concat.Rd
@@ -1,4 +1,4 @@
-% Generated by roxygen2 (4.1.1): do not edit by hand
+% Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/mxnet_generated.R
 \name{mx.symbol.Concat}
 \alias{mx.symbol.Concat}
diff --git a/R-package/man/mx.symbol.Convolution.Rd b/R-package/man/mx.symbol.Convolution.Rd
index ba5f0d666cf8..8914c6cbec78 100644
--- a/R-package/man/mx.symbol.Convolution.Rd
+++ b/R-package/man/mx.symbol.Convolution.Rd
@@ -1,4 +1,4 @@
-% Generated by roxygen2 (4.1.1): do not edit by hand
+% Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/mxnet_generated.R
 \name{mx.symbol.Convolution}
 \alias{mx.symbol.Convolution}
@@ -29,7 +29,7 @@ pad for convolution: (y, x)}
 convolution filter(channel) number}
 
 \item{num.group}{int (non-negative), optional, default=1
-number of groups partition}
This option is not supported by CuDNN; you can achieve the same effect with SliceChannel: split the input into num_group parts, apply convolution to each, and concat the results.} \item{workspace}{long (non-negative), optional, default=512 Tmp workspace for convolution (MB)} diff --git a/R-package/man/mx.symbol.Dropout.Rd b/R-package/man/mx.symbol.Dropout.Rd index 560b580eeba1..f86e475c8417 100644 --- a/R-package/man/mx.symbol.Dropout.Rd +++ b/R-package/man/mx.symbol.Dropout.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/mxnet_generated.R \name{mx.symbol.Dropout} \alias{mx.symbol.Dropout} diff --git a/R-package/man/mx.symbol.ElementWiseSum.Rd b/R-package/man/mx.symbol.ElementWiseSum.Rd index 601c7d03a60e..4bbdd72ef425 100644 --- a/R-package/man/mx.symbol.ElementWiseSum.Rd +++ b/R-package/man/mx.symbol.ElementWiseSum.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/mxnet_generated.R \name{mx.symbol.ElementWiseSum} \alias{mx.symbol.ElementWiseSum} diff --git a/R-package/man/mx.symbol.Flatten.Rd b/R-package/man/mx.symbol.Flatten.Rd index de96b1a6b41e..8f0c239d67d5 100644 --- a/R-package/man/mx.symbol.Flatten.Rd +++ b/R-package/man/mx.symbol.Flatten.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/mxnet_generated.R \name{mx.symbol.Flatten} \alias{mx.symbol.Flatten} diff --git a/R-package/man/mx.symbol.FullyConnected.Rd b/R-package/man/mx.symbol.FullyConnected.Rd index 31d87ef1cf81..fab961dd4ee1 100644 --- a/R-package/man/mx.symbol.FullyConnected.Rd +++ b/R-package/man/mx.symbol.FullyConnected.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/mxnet_generated.R \name{mx.symbol.FullyConnected} \alias{mx.symbol.FullyConnected} diff --git a/R-package/man/mx.symbol.Group.Rd b/R-package/man/mx.symbol.Group.Rd index f46e30a13731..c3162db22188 100644 --- a/R-package/man/mx.symbol.Group.Rd +++ b/R-package/man/mx.symbol.Group.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/symbol.R \name{mx.symbol.Group} \alias{mx.symbol.Group} diff --git a/R-package/man/mx.symbol.LRN.Rd b/R-package/man/mx.symbol.LRN.Rd index 1c74dfc5bd5a..748767828b1a 100644 --- a/R-package/man/mx.symbol.LRN.Rd +++ b/R-package/man/mx.symbol.LRN.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/mxnet_generated.R \name{mx.symbol.LRN} \alias{mx.symbol.LRN} diff --git a/R-package/man/mx.symbol.LeakyReLU.Rd b/R-package/man/mx.symbol.LeakyReLU.Rd index bb843847555a..3a91c82e2df7 100644 --- a/R-package/man/mx.symbol.LeakyReLU.Rd +++ b/R-package/man/mx.symbol.LeakyReLU.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/mxnet_generated.R \name{mx.symbol.LeakyReLU} \alias{mx.symbol.LeakyReLU} diff --git a/R-package/man/mx.symbol.LinearRegressionOutput.Rd b/R-package/man/mx.symbol.LinearRegressionOutput.Rd index 8d00dd325d1b..4dd9faef6082 100644 --- a/R-package/man/mx.symbol.LinearRegressionOutput.Rd +++ b/R-package/man/mx.symbol.LinearRegressionOutput.Rd @@ -1,4 +1,4 @@ -%
Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/mxnet_generated.R \name{mx.symbol.LinearRegressionOutput} \alias{mx.symbol.LinearRegressionOutput} diff --git a/R-package/man/mx.symbol.LogisticRegressionOutput.Rd b/R-package/man/mx.symbol.LogisticRegressionOutput.Rd index 221816ea6c15..f6825519961e 100644 --- a/R-package/man/mx.symbol.LogisticRegressionOutput.Rd +++ b/R-package/man/mx.symbol.LogisticRegressionOutput.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/mxnet_generated.R \name{mx.symbol.LogisticRegressionOutput} \alias{mx.symbol.LogisticRegressionOutput} diff --git a/R-package/man/mx.symbol.Pooling.Rd b/R-package/man/mx.symbol.Pooling.Rd index b7faf88b7a97..427c0df10ecc 100644 --- a/R-package/man/mx.symbol.Pooling.Rd +++ b/R-package/man/mx.symbol.Pooling.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/mxnet_generated.R \name{mx.symbol.Pooling} \alias{mx.symbol.Pooling} diff --git a/R-package/man/mx.symbol.Reshape.Rd b/R-package/man/mx.symbol.Reshape.Rd index b06b74973ae3..803e5d1d4335 100644 --- a/R-package/man/mx.symbol.Reshape.Rd +++ b/R-package/man/mx.symbol.Reshape.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/mxnet_generated.R \name{mx.symbol.Reshape} \alias{mx.symbol.Reshape} diff --git a/R-package/man/mx.symbol.SliceChannel.Rd b/R-package/man/mx.symbol.SliceChannel.Rd index 3416b6fbfb5c..93595749195c 100644 --- a/R-package/man/mx.symbol.SliceChannel.Rd +++ b/R-package/man/mx.symbol.SliceChannel.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/mxnet_generated.R \name{mx.symbol.SliceChannel} \alias{mx.symbol.SliceChannel} diff --git a/R-package/man/mx.symbol.Softmax.Rd b/R-package/man/mx.symbol.Softmax.Rd index 0b9fd0abf869..d574270170a1 100644 --- a/R-package/man/mx.symbol.Softmax.Rd +++ b/R-package/man/mx.symbol.Softmax.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/mxnet_generated.R \name{mx.symbol.Softmax} \alias{mx.symbol.Softmax} @@ -13,6 +13,9 @@ Input data to softmax.} \item{grad.scale}{float, optional, default=1 Scale the gradient by a float factor} +\item{multi.output}{boolean, optional, default=False +If set to true, for a (n,k,x_1,..,x_n) dimensional input tensor, softmax will generate n*x_1*...*x_n outputs, each with k classes} + \item{name}{string, optional Name of the resulting symbol.} } diff --git a/R-package/man/mx.symbol.Variable.Rd b/R-package/man/mx.symbol.Variable.Rd index 725e366b3e53..304609ce6ec7 100644 --- a/R-package/man/mx.symbol.Variable.Rd +++ b/R-package/man/mx.symbol.Variable.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/symbol.R \name{mx.symbol.Variable} \alias{mx.symbol.Variable} diff --git a/R-package/man/mx.symbol.exp.Rd b/R-package/man/mx.symbol.exp.Rd new file mode 100644 index 000000000000..7ae386a70573 --- /dev/null +++ b/R-package/man/mx.symbol.exp.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation
in R/mxnet_generated.R +\name{mx.symbol.exp} +\alias{mx.symbol.exp} +\title{Take exp of the src} +\usage{ +mx.symbol.exp(...) +} +\arguments{ +\item{src}{Symbol +Source symbolic input to the function} + +\item{name}{string, optional +Name of the resulting symbol.} +} +\value{ +out The result mx.symbol +} +\description{ +Take exp of the src +} + diff --git a/R-package/man/mx.symbol.infer.shape.Rd b/R-package/man/mx.symbol.infer.shape.Rd index 0494e982a304..8d965bbea078 100644 --- a/R-package/man/mx.symbol.infer.shape.Rd +++ b/R-package/man/mx.symbol.infer.shape.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/symbol.R \name{mx.symbol.infer.shape} \alias{mx.symbol.infer.shape} diff --git a/R-package/man/mx.symbol.load.Rd b/R-package/man/mx.symbol.load.Rd index 19a2f11d7329..6af053b978f6 100644 --- a/R-package/man/mx.symbol.load.Rd +++ b/R-package/man/mx.symbol.load.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/symbol.R \name{mx.symbol.load} \alias{mx.symbol.load} @@ -16,5 +16,6 @@ Load an mx.symbol object data = mx.symbol.Variable('data') mx.symbol.save(data, 'temp.symbol') data2 = mx.symbol.load('temp.symbol') + } diff --git a/R-package/man/mx.symbol.log.Rd b/R-package/man/mx.symbol.log.Rd new file mode 100644 index 000000000000..4bd3ac0f0122 --- /dev/null +++ b/R-package/man/mx.symbol.log.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/mxnet_generated.R +\name{mx.symbol.log} +\alias{mx.symbol.log} +\title{Take log of the src} +\usage{ +mx.symbol.log(...) +} +\arguments{ +\item{src}{Symbol +Source symbolic input to the function} + +\item{name}{string, optional +Name of the resulting symbol.} +} +\value{ +out The result mx.symbol +} +\description{ +Take log of the src +} + diff --git a/R-package/man/mx.symbol.save.Rd b/R-package/man/mx.symbol.save.Rd index 8cc86655e055..ab6cef2d1df9 100644 --- a/R-package/man/mx.symbol.save.Rd +++ b/R-package/man/mx.symbol.save.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/symbol.R \name{mx.symbol.save} \alias{mx.symbol.save} @@ -18,5 +18,6 @@ Save an mx.symbol object data = mx.symbol.Variable('data') mx.symbol.save(data, 'temp.symbol') data2 = mx.symbol.load('temp.symbol') + } diff --git a/R-package/man/mx.symbol.sqrt.Rd b/R-package/man/mx.symbol.sqrt.Rd new file mode 100644 index 000000000000..c810c5a845fb --- /dev/null +++ b/R-package/man/mx.symbol.sqrt.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/mxnet_generated.R +\name{mx.symbol.sqrt} +\alias{mx.symbol.sqrt} +\title{Take sqrt of the src} +\usage{ +mx.symbol.sqrt(...) 
+} +\arguments{ +\item{src}{Symbol +Source symbolic input to the function} + +\item{name}{string, optional +Name of the resulting symbol.} +} +\value{ +out The result mx.symbol +} +\description{ +Take sqrt of the src +} + diff --git a/R-package/man/mx.symbol.square.Rd b/R-package/man/mx.symbol.square.Rd new file mode 100644 index 000000000000..c64b4aa00590 --- /dev/null +++ b/R-package/man/mx.symbol.square.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/mxnet_generated.R +\name{mx.symbol.square} +\alias{mx.symbol.square} +\title{Take square of the src} +\usage{ +mx.symbol.square(...) +} +\arguments{ +\item{src}{Symbol +Source symbolic input to the function} + +\item{name}{string, optional +Name of the resulting symbol.} +} +\value{ +out The result mx.symbol +} +\description{ +Take square of the src +} + diff --git a/R-package/man/mxnet.Rd b/R-package/man/mxnet.Rd index df05e4ae6230..c3af345122fc 100644 --- a/R-package/man/mxnet.Rd +++ b/R-package/man/mxnet.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/zzz.R \docType{package} \name{mxnet} diff --git a/R-package/man/mxnet.export.Rd b/R-package/man/mxnet.export.Rd index 69a10c5b16be..e8bdc1c26028 100644 --- a/R-package/man/mxnet.export.Rd +++ b/R-package/man/mxnet.export.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/util.R \name{mxnet.export} \alias{mxnet.export} diff --git a/R-package/man/outputs.Rd b/R-package/man/outputs.Rd index acf1fa31e98c..95f184468df8 100644 --- a/R-package/man/outputs.Rd +++ b/R-package/man/outputs.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/symbol.R \name{outputs} \alias{outputs} diff --git a/R-package/man/predict.MXFeedForwardModel.Rd b/R-package/man/predict.MXFeedForwardModel.Rd index f6fb970f102f..a9802491a307 100644 --- a/R-package/man/predict.MXFeedForwardModel.Rd +++ b/R-package/man/predict.MXFeedForwardModel.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/model.R \name{predict.MXFeedForwardModel} \alias{predict.MXFeedForwardModel} @@ -17,11 +17,11 @@ \item{array.batch.size}{The batch size used in batching. Only used when X is R's array.} \item{array.layout}{can be "auto", "colmajor", "rowmajor", (default=auto) - The layout of array. "rowmajor" is only supported for two dimensional array. - For matrix, "rowmajor" means dim(X) = c(nexample, nfeatures), - "colmajor" means dim(X) = c(nfeatures, nexample) - "auto" will auto detect the layout by match the feature size, - and will report error when X is a square matrix to ask user to explicitly specify layout.} +The layout of array. "rowmajor" is only supported for two dimensional array. +For matrix, "rowmajor" means dim(X) = c(nexample, nfeatures), +"colmajor" means dim(X) = c(nfeatures, nexample) +"auto" will auto-detect the layout by matching the feature size, + and will report an error when X is a square matrix, to ask the user to explicitly specify the layout.} } \description{ Predict the outputs given a model and dataset.
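[Editor's note] The array.layout semantics above are easy to misread, so here is a minimal Python sketch of the auto-detection rule. The detect_layout helper is hypothetical, written only for illustration, and is not the R package's actual implementation; it just shows why a square matrix forces an explicit layout:

```python
import numpy as np

def detect_layout(X, nfeatures):
    """Guess whether a 2-D array is rowmajor, dim(X) = (nexample, nfeatures),
    or colmajor, dim(X) = (nfeatures, nexample), by matching the feature axis."""
    nrow, ncol = X.shape
    if nrow == ncol:
        # Both interpretations match the feature size: refuse to guess,
        # which is exactly why predict() reports an error in this case.
        raise ValueError("X is square; specify array.layout explicitly")
    if ncol == nfeatures:
        return "rowmajor"
    if nrow == nfeatures:
        return "colmajor"
    raise ValueError("neither axis matches the model's feature size")

X = np.zeros((100, 8))      # 100 examples, 8 features
print(detect_layout(X, 8))  # -> "rowmajor"
```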
diff --git a/R-package/man/print.MXNDArray.Rd b/R-package/man/print.MXNDArray.Rd index cafcadd31992..769324f3cdf1 100644 --- a/R-package/man/print.MXNDArray.Rd +++ b/R-package/man/print.MXNDArray.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/ndarray.R \name{print.MXNDArray} \alias{print.MXNDArray} diff --git a/R-package/tests/testthat/test_model.R b/R-package/tests/testthat/test_model.R new file mode 100644 index 000000000000..93784a622bbb --- /dev/null +++ b/R-package/tests/testthat/test_model.R @@ -0,0 +1,62 @@ +require(mxnet) + +context("models") + +# test_that("basic symbol operation", { +# # Network configuration +# batch.size <- 100 +# data <- mx.symbol.Variable("data") +# fc1 <- mx.symbol.FullyConnected(data, name="fc1", num_hidden=128) +# act1 <- mx.symbol.Activation(fc1, name="relu1", act_type="relu") +# fc2 <- mx.symbol.FullyConnected(act1, name = "fc2", num_hidden = 64) +# act2 <- mx.symbol.Activation(fc2, name="relu2", act_type="relu") +# fc3 <- mx.symbol.FullyConnected(act2, name="fc3", num_hidden=10) +# softmax <- mx.symbol.Softmax(fc3, name = "sm") +# +# dtrain = mx.io.MNISTIter( +# image="data/train-images-idx3-ubyte", +# label="data/train-labels-idx1-ubyte", +# data.shape=c(784), +# batch.size=batch.size, +# shuffle=TRUE, +# flat=TRUE, +# silent=0, +# seed=10) +# +# dtest = mx.io.MNISTIter( +# image="data/t10k-images-idx3-ubyte", +# label="data/t10k-labels-idx1-ubyte", +# data.shape=c(784), +# batch.size=batch.size, +# shuffle=FALSE, +# flat=TRUE, +# silent=0) +# +# mx.set.seed(0) +# devices = lapply(1:2, function(i) { +# mx.cpu(i) +# }) +# +# # create the model +# model <- mx.model.FeedForward.create(softmax, X=dtrain, eval.data=dtest, +# ctx=devices, num.round=1, +# learning.rate=0.1, momentum=0.9, +# initializer=mx.init.uniform(0.07), +# epoch.end.callback=mx.callback.save.checkpoint("chkpt"), +# batch.end.callback=mx.callback.log.train.metric(100)) +# +# # do prediction +# pred <- predict(model, dtest) +# label <- mx.io.extract(dtest, "label") +# dataX <- mx.io.extract(dtest, "data") +# # Predict with R's array +# pred2 <- predict(model, X=dataX) +# +# accuracy <- function(label, pred) { +# ypred = max.col(t(as.array(pred))) +# return(sum((as.array(label) + 1) == ypred) / length(label)) +# } +# +# print(paste0("Finish prediction... accuracy=", accuracy(label, pred))) +# print(paste0("Finish prediction... accuracy2=", accuracy(label, pred2))) +# }) diff --git a/R-package/vignettes/fiveMinutesNeuralNetwork.Rmd b/R-package/vignettes/fiveMinutesNeuralNetwork.Rmd index 82ad3cd4515a..b6c81dcd28fc 100644 --- a/R-package/vignettes/fiveMinutesNeuralNetwork.Rmd +++ b/R-package/vignettes/fiveMinutesNeuralNetwork.Rmd @@ -45,8 +45,8 @@ fc1 <- mx.symbol.FullyConnected(data, num_hidden=20) act1 <- mx.symbol.Activation(fc1, act_type="tanh") fc2 <- mx.symbol.FullyConnected(act1, num_hidden=2) -# Softmax function for the output layer -softmax <- mx.symbol.Softmax(fc2) +# SoftmaxOutput means multi-class probability prediction. +softmax <- mx.symbol.SoftmaxOutput(fc2) ``` According to the comments in the code, you can see the meaning of each function and its arguments. They can be easily modified according to your need. 
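[Editor's note] For readers following the vignette from Python, the same two-layer configuration with the renamed operator looks as follows. This is only a sketch that assumes `mxnet` is importable; it mirrors the `doc/python` examples updated later in this patch:

```python
import mxnet as mx

# Two-layer perceptron matching the R vignette above.
data = mx.symbol.Variable('data')
fc1 = mx.symbol.FullyConnected(data, name='fc1', num_hidden=20)
act1 = mx.symbol.Activation(fc1, name='tanh1', act_type='tanh')
fc2 = mx.symbol.FullyConnected(act1, name='fc2', num_hidden=2)
# Softmax is renamed to SoftmaxOutput throughout this patch: the symbol
# couples the softmax transformation with the training loss at the output.
softmax = mx.symbol.SoftmaxOutput(fc2, name='sm')
```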
@@ -103,7 +103,7 @@ data <- mx.symbol.Variable("data") # num_hidden: number of neurons in this hidden layer fc1 <- mx.symbol.FullyConnected(data, num_hidden=1) -# Softmax function for the output layer +# Use linear regression for the output layer lro <- mx.symbol.LinearRegressionOutput(fc1) ``` diff --git a/R-package/vignettes/mnistCompetition.Rmd b/R-package/vignettes/mnistCompetition.Rmd index 1913887426cf..d34e92adf262 100644 --- a/R-package/vignettes/mnistCompetition.Rmd +++ b/R-package/vignettes/mnistCompetition.Rmd @@ -49,7 +49,7 @@ act1 <- mx.symbol.Activation(fc1, name="relu1", act_type="relu") fc2 <- mx.symbol.FullyConnected(act1, name="fc2", num_hidden=64) act2 <- mx.symbol.Activation(fc2, name="relu2", act_type="relu") fc3 <- mx.symbol.FullyConnected(act2, name="fc3", num_hidden=10) -softmax <- mx.symbol.Softmax(fc3, name="sm") +softmax <- mx.symbol.SoftmaxOutput(fc3, name="sm") ``` 1. In `mxnet`, we use its own data type `symbol` to configure the network. `data <- mx.symbol.Variable("data")` uses `data` to represent the input data, i.e. the input layer. @@ -128,7 +128,7 @@ tanh3 <- mx.symbol.Activation(data=fc1, act_type="tanh") # second fullc fc2 <- mx.symbol.FullyConnected(data=tanh3, num_hidden=10) # loss -lenet <- mx.symbol.Softmax(data=fc2) +lenet <- mx.symbol.SoftmaxOutput(data=fc2) ``` Then let us reshape the matrices into arrays: @@ -143,7 +143,7 @@ dim(test.array) <- c(28, 28, 1, ncol(test)) Next we are going to compare the training speed on different devices, so the definition of the devices goes first: ```{r} -n.gpu <- 1 +n.gpu <- 1 device.cpu <- mx.cpu() device.gpu <- lapply(0:(n.gpu-1), function(i) { mx.gpu(i) @@ -163,7 +163,7 @@ model <- mx.model.FeedForward.create(lenet, X=train.array, y=train.y, learning.rate=0.05, momentum=0.9, wd=0.00001, eval.metric=mx.metric.accuracy, epoch.end.callback=mx.callback.log.train.metric(100)) -print(proc.time() - tic) +print(proc.time() - tic) ``` Training on GPU: @@ -176,7 +176,7 @@ model <- mx.model.FeedForward.create(lenet, X=train.array, y=train.y, learning.rate=0.05, momentum=0.9, wd=0.00001, eval.metric=mx.metric.accuracy, epoch.end.callback=mx.callback.log.train.metric(100)) -print(proc.time() - tic) +print(proc.time() - tic) ``` As you can see by using GPU, we can get a much faster speedup in training! diff --git a/R-package/vignettes/ndarrayAndSymbolTutorial.Rmd b/R-package/vignettes/ndarrayAndSymbolTutorial.Rmd index 0f69d5449344..3c729664558c 100644 --- a/R-package/vignettes/ndarrayAndSymbolTutorial.Rmd +++ b/R-package/vignettes/ndarrayAndSymbolTutorial.Rmd @@ -13,7 +13,7 @@ applications. There are two major concepts introduced in this tutorial. ## NDArray: Vectorized tensor computations on CPUs and GPUs -`NDArray` is the basic vectorized operation unit in MXNet for matrix and tensor computations. +`NDArray` is the basic vectorized operation unit in MXNet for matrix and tensor computations. Users can perform usual calculations as on R's array, but with two additional features: 1. **multiple devices**: all operations can be run on various devices including @@ -67,7 +67,7 @@ d <- c / a - 5 as.array(d) ``` -If two `NDArray`s sit on different divices, we need to explicitly move them +If two `NDArray`s sit on different devices, we need to explicitly move them
For instance: ```{r, eval=FALSE} @@ -93,7 +93,7 @@ a <- mx.nd.load("temp.ndarray") as.array(a[[1]]) ``` -In case you want to save data to the distributed file system such as S3 and HDFS, +In case you want to save data to the distributed file system such as S3 and HDFS, we can directly save to and load from them. For example: ```{r,eval=FALSE} @@ -156,7 +156,7 @@ net <- mx.symbol.Variable("data") net <- mx.symbol.FullyConnected(data=net, name="fc1", num_hidden=128) net <- mx.symbol.Activation(data=net, name="relu1", act_type="relu") net <- mx.symbol.FullyConnected(data=net, name="fc2", num_hidden=64) -net <- mx.symbol.Softmax(data=net, name="out") +net <- mx.symbol.SoftmaxOutput(data=net, name="out") class(net) ``` diff --git a/README.md b/README.md index aef489ff7920..2c9d5a539b24 100644 --- a/README.md +++ b/README.md @@ -12,12 +12,11 @@ deep learning programs together to maximize the efficiency and your productivity What's New ---------- +* [Minimum MXNet Library in One File](amalgamation) * [Training Deep Net on 14 Million Images on A Single Machine](https://mxnet-bing.readthedocs.org/en/latest/tutorial/imagenet_full.html) * [MXNet.jl Julia binding initial release](https://github.com/dmlc/MXNet.jl) * [Design Note: Squeeze the Memory Consumption of Deep Learning](http://mxnet.readthedocs.org/en/latest/developer-guide/note_memory.html) -* [LSTM Example by using symbolic API](https://github.com/dmlc/mxnet/tree/master/example/rnn) -* [MXNet R Package brings Deep learning for R!](https://github.com/dmlc/mxnet/tree/master/R-package) -* [Design Note: Dependency Engine for Deep Learning](http://mxnet.readthedocs.org/en/latest/developer-guide/note_engine.html) + Contents -------- diff --git a/amalgamation/.gitignore b/amalgamation/.gitignore new file mode 100644 index 000000000000..318284280c8a --- /dev/null +++ b/amalgamation/.gitignore @@ -0,0 +1 @@ +*-all.cc diff --git a/amalgamation/Makefile b/amalgamation/Makefile new file mode 100644 index 000000000000..def30163f109 --- /dev/null +++ b/amalgamation/Makefile @@ -0,0 +1,44 @@ +export MXNET_ROOT=`pwd`/.. 
+# Change this to the path of OpenBLAS +export OPENBLAS_ROOT=`pwd`/OpenBLAS + +# Whether to use the minimum build without BLAS and SSE; this will make the library super slow +ifndef MIN + export MIN= 0 +endif + +.PHONY: all clean + +CFLAGS=-std=c++11 -Wno-unknown-pragmas -Wall +LDFLAGS=-lrt + +ifneq ($(MIN), 1) + CFLAGS+= -I${OPENBLAS_ROOT} + LDFLAGS+=-L${OPENBLAS_ROOT} -lopenblas +endif + +all: libmxnet_predict.a ${MXNET_ROOT}/lib/libmxnet_predict.so + +mxnet_predict0.d: mxnet_predict0.cc + ${CXX} ${CFLAGS} -MD -MF $@ \ + -I ${MXNET_ROOT}/ -I ${MXNET_ROOT}/mshadow/ -I ${MXNET_ROOT}/dmlc-core/include \ + -I ${MXNET_ROOT}/include -c $+ + rm mxnet_predict0.o + +mxnet_predict-all.cc: mxnet_predict0.d mxnet_predict0.cc + @echo "Generating amalgamation to " $@ + python ./amalgamation.py $+ $@ $(MIN) + +mxnet_predict-all.o: mxnet_predict-all.cc + ${CXX} ${CFLAGS} -fPIC -o $@ -c $+ + +libmxnet_predict.a: mxnet_predict-all.o + ar rcs libmxnet_predict.a $+ + +${MXNET_ROOT}/lib/libmxnet_predict.so: mxnet_predict-all.o + @mkdir -p ${MXNET_ROOT}/lib + ${CXX} ${CFLAGS} -shared -o $@ $(filter %.o %.a, $^) $(LDFLAGS) + ls -alh $@ + +clean: + rm -f *.d *.o diff --git a/amalgamation/README.md b/amalgamation/README.md new file mode 100644 index 000000000000..c42a86981e50 --- /dev/null +++ b/amalgamation/README.md @@ -0,0 +1,23 @@ +MXNet Amalgamation +================== +This folder contains an amalgamation generation script to generate the entire mxnet library into one file. +Currently it supports generation for [predict API](../include/mxnet/c_predict_api.h), +which allows you to run prediction in a platform-independent way. + +How to Generate the Amalgamation +-------------------------------- +Typing ```make``` will generate the following files +- mxnet_predict-all.cc + - The file you can use to compile the predict API +- ../lib/libmxnet_predict.so + - The dynamic library generated for prediction. + +You can also check out the [Makefile](Makefile). + +Dependency +---------- +The only dependency is a BLAS library. + +Acknowledgement +--------------- +This module was created by [Jack Deng](https://github.com/jdeng).
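[Editor's note] To make the pipeline concrete: `make` first runs the compiler with `-MD` to record every header pulled in by `mxnet_predict0.cc` into `mxnet_predict0.d`, then invokes `python ./amalgamation.py mxnet_predict0.d mxnet_predict0.cc mxnet_predict-all.cc $(MIN)` (see the Makefile above). Below is a condensed Python sketch of the dependency-file parsing step that the full script in the next hunk performs; the extra `':'` check for the object-file target is an illustrative tweak, not part of the original script:

```python
import os.path

def get_sources(dep_file):
    """Collect project-local files named in a gcc -MD dependency file,
    skipping the target entry, line continuations and system headers."""
    sources, visited = [], set()
    for tok in open(dep_file).read().split():
        if not tok or tok == '\\' or tok.endswith('.o') or tok.endswith(':'):
            continue  # drop continuations and the "mxnet_predict0.o:" target
        path = os.path.relpath(tok)
        if '/usr/' not in path and path not in visited:
            visited.add(path)
            sources.append(path)
    return sources

# e.g. get_sources('mxnet_predict0.d')
# -> ['mxnet_predict0.cc', 'include/mxnet/c_predict_api.h', ...]
```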
diff --git a/amalgamation/amalgamation.py b/amalgamation/amalgamation.py new file mode 100644 index 000000000000..888367b8b345 --- /dev/null +++ b/amalgamation/amalgamation.py @@ -0,0 +1,125 @@ +import sys +import os.path, re, StringIO + +blacklist = [ + 'Windows.h', 'cublas_v2.h', 'cuda/tensor_gpu-inl.cuh', + 'cuda_runtime.h', 'cudnn.h', 'cudnn_lrn-inl.h', 'curand.h', + 'glog/logging.h', 'io/azure_filesys.h', 'io/hdfs_filesys.h', 'io/s3_filesys.h', + 'kvstore_dist.h', 'mach/clock.h', 'mach/mach.h', + 'malloc.h', 'mkl.h', 'mkl_cblas.h', 'mkl_vsl.h', 'mkl_vsl_functions.h', + 'nvml.h', 'opencv2/opencv.hpp', 'sys/stat.h', 'sys/types.h' + ] + +if len(sys.argv) < 4: + print("Usage: <dep_file> <main_cc> <output_cc> [minimum=0]\n" + "Minimum means no blas, no sse, no dependency; it may run twice as slow.") + exit(0) + +minimum = int(sys.argv[4]) if len(sys.argv) > 4 else 0 + +if minimum: + blacklist += ['packet/sse-inl.h', 'emmintrin.h'] + +def get_sources(def_file): + sources = [] + files = [] + visited = set() + for line in open(def_file): + files = files + line.strip().split(' ') + + for f in files: + f = f.strip() + if not f or f.endswith('.o') or f == '\\': continue + fn = os.path.relpath(f) + if fn.find('/usr/') < 0 and fn not in visited: + sources.append(fn) + visited.add(fn) + return sources + +sources = get_sources(sys.argv[1]) + +def find_source(name, start): + candidates = [] + for x in sources: + if x == name or x.endswith('/' + name): candidates.append(x) + if not candidates: return '' + if len(candidates) == 1: return candidates[0] + for x in candidates: + if x.split('/')[1] == start.split('/')[1]: return x + return '' + + +re1 = re.compile('<([./a-zA-Z0-9_-]*)>') +re2 = re.compile('"([./a-zA-Z0-9_-]*)"') + +sysheaders = [] +history = set([]) +out = StringIO.StringIO() + +def expand(x, pending): + if x in history and x not in ['mshadow/mshadow/expr_scalar-inl.h']: # MULTIPLE includes + return + + if x in pending: + #print 'loop found: %s in ' % x, pending + return + + print >>out, "//===== EXPANDING: %s =====\n" %x + for line in open(x): + if line.find('#include') < 0: + out.write(line) + continue + if line.strip().find('#include') > 0: + print line + continue + m = re1.search(line) + if not m: m = re2.search(line) + if not m: + print line + ' not found' + continue + h = m.groups()[0].strip('./') + source = find_source(h, x) + if not source: + if h not in blacklist and h not in sysheaders: sysheaders.append(h) + else: + expand(source, pending + [x]) + print >>out, "//===== EXPANDED: %s =====\n" %x + history.add(x) + + +expand(sys.argv[2], []) + +f = open(sys.argv[3], 'wb') + +if minimum != 0: + print >>f, "#define MSHADOW_STAND_ALONE 1" + print >>f, "#define MSHADOW_USE_SSE 0" + print >>f, "#define MSHADOW_USE_CBLAS 0" + +print >>f, ''' +#if defined(__MACH__) +#include <mach/clock.h> +#include <mach/mach.h> +#endif + +#if !defined(__WIN32__) +#include <sys/stat.h> +#include <sys/types.h> + +#if !defined(__ANDROID__) && (!defined(MSHADOW_USE_SSE) || MSHADOW_USE_SSE == 1) +#include <emmintrin.h> +#endif + +#endif +''' + +for k in sorted(sysheaders): + print >>f, "#include <%s>" % k + +print >>f, '' +print >>f, out.getvalue() + +for x in sources: + if x not in history and not x.endswith('.o'): + print 'Not processed:', x + diff --git a/amalgamation/mxnet_predict0.cc b/amalgamation/mxnet_predict0.cc new file mode 100644 index 000000000000..4397308177d2 --- /dev/null +++ b/amalgamation/mxnet_predict0.cc @@ -0,0 +1,48 @@ +// mxnet.cc + +#define MSHADOW_FORCE_STREAM +#ifndef MSHADOW_USE_CBLAS +#define MSHADOW_USE_CBLAS 1 +#endif +#define MSHADOW_USE_CUDA 0 +#define MSHADOW_USE_MKL 0 +#define
MSHADOW_RABIT_PS 0 +#define MSHADOW_DIST_PS 0 + +#define MXNET_USE_OPENCV 0 +#define MXNET_PREDICT_ONLY 1 +#define DISABLE_OPENMP 1 + +#include "src/ndarray/unary_function.cc" +#include "src/ndarray/ndarray_function.cc" +#include "src/ndarray/ndarray.cc" +#include "src/engine/engine.cc" +#include "src/engine/naive_engine.cc" +#include "src/symbol/graph_executor.cc" +#include "src/symbol/static_graph.cc" +#include "src/symbol/symbol.cc" +#include "src/operator/operator.cc" +#include "src/operator/activation.cc" +#include "src/operator/batch_norm.cc" +#include "src/operator/block_grad.cc" +#include "src/operator/concat.cc" +#include "src/operator/convolution.cc" +#include "src/operator/dropout.cc" +#include "src/operator/elementwise_binary_op.cc" +#include "src/operator/elementwise_sum.cc" +#include "src/operator/fully_connected.cc" +#include "src/operator/leaky_relu.cc" +#include "src/operator/lrn.cc" +#include "src/operator/pooling.cc" +#include "src/operator/regression_output.cc" +#include "src/operator/reshape.cc" +#include "src/operator/slice_channel.cc" +#include "src/operator/softmax_output.cc" +#include "src/operator/deconvolution.cc" +#include "src/storage/storage.cc" +#include "src/common/tblob_op_registry.cc" + +#include "src/resource.cc" + +#include "src/c_api/c_predict_api.cc" +#include "src/c_api/c_api_error.cc" diff --git a/doc/R-package/fiveMinutesNeuralNetwork.md b/doc/R-package/fiveMinutesNeuralNetwork.md index a58eafa62474..e9018ecf6374 100644 --- a/doc/R-package/fiveMinutesNeuralNetwork.md +++ b/doc/R-package/fiveMinutesNeuralNetwork.md @@ -61,8 +61,8 @@ fc1 <- mx.symbol.FullyConnected(data, num_hidden=20) act1 <- mx.symbol.Activation(fc1, act_type="tanh") fc2 <- mx.symbol.FullyConnected(act1, num_hidden=2) -# Softmax function for the output layer -softmax <- mx.symbol.Softmax(fc2) +# SoftmaxOutput means multi-class probability prediction. +softmax <- mx.symbol.SoftmaxOutput(fc2) ``` According to the comments in the code, you can see the meaning of each function and its arguments. They can be easily modified according to your need. @@ -163,7 +163,7 @@ data <- mx.symbol.Variable("data") # num_hidden: number of neurons in this hidden layer fc1 <- mx.symbol.FullyConnected(data, num_hidden=1) -# Softmax function for the output layer +# Use linear regression for the output layer lro <- mx.symbol.LinearRegressionOutput(fc1) ``` diff --git a/doc/R-package/mnistCompetition.md b/doc/R-package/mnistCompetition.md index 16a7ca761146..95fff099bd04 100644 --- a/doc/R-package/mnistCompetition.md +++ b/doc/R-package/mnistCompetition.md @@ -51,7 +51,7 @@ table(train.y) ``` ## train.y -## 0 1 2 3 4 5 6 7 8 9 +## 0 1 2 3 4 5 6 7 8 9 ## 4132 4684 4177 4351 4072 3795 4137 4401 4063 4188 ``` @@ -67,7 +67,7 @@ act1 <- mx.symbol.Activation(fc1, name="relu1", act_type="relu") fc2 <- mx.symbol.FullyConnected(act1, name="fc2", num_hidden=64) act2 <- mx.symbol.Activation(fc2, name="relu2", act_type="relu") fc3 <- mx.symbol.FullyConnected(act2, name="fc3", num_hidden=10) -softmax <- mx.symbol.Softmax(fc3, name="sm") +softmax <- mx.symbol.SoftmaxOutput(fc3, name="sm") ``` 1. In `mxnet`, we use its own data type `symbol` to configure the network. `data <- mx.symbol.Variable("data")` use `data` to represent the input data, i.e. the input layer. 
@@ -177,7 +177,7 @@ table(pred.label) ``` ## pred.label -## 0 1 2 3 4 5 6 7 8 9 +## 0 1 2 3 4 5 6 7 8 9 ## 2818 3195 2744 2767 2683 2596 2798 2790 2784 2825 ``` @@ -216,7 +216,7 @@ tanh3 <- mx.symbol.Activation(data=fc1, act_type="tanh") # second fullc fc2 <- mx.symbol.FullyConnected(data=tanh3, num_hidden=10) # loss -lenet <- mx.symbol.Softmax(data=fc2) +lenet <- mx.symbol.SoftmaxOutput(data=fc2) ``` Then let us reshape the matrices into arrays: @@ -233,7 +233,7 @@ Next we are going to compare the training speed on different devices, so the def ```r -n.gpu <- 1 +n.gpu <- 1 device.cpu <- mx.cpu() device.gpu <- lapply(0:(n.gpu-1), function(i) { mx.gpu(i) @@ -266,11 +266,11 @@ model <- mx.model.FeedForward.create(lenet, X=train.array, y=train.y, ``` ```r -print(proc.time() - tic) +print(proc.time() - tic) ``` ``` -## user system elapsed +## user system elapsed ## 130.030 204.976 83.821 ``` @@ -317,11 +317,11 @@ model <- mx.model.FeedForward.create(lenet, X=train.array, y=train.y, ``` ```r -print(proc.time() - tic) +print(proc.time() - tic) ``` ``` -## user system elapsed +## user system elapsed ## 9.288 1.680 6.889 ``` diff --git a/doc/python/model.md b/doc/python/model.md index fca3a39ec893..f6f27c99d082 100644 --- a/doc/python/model.md +++ b/doc/python/model.md @@ -23,7 +23,7 @@ data = mx.symbol.Variable('data') fc1 = mx.symbol.FullyConnected(data, name='fc1', num_hidden=128) act1 = mx.symbol.Activation(fc1, name='relu1', act_type='relu') fc2 = mx.symbol.FullyConnected(act1, name='fc2', num_hidden=64) -softmax = mx.symbol.Softmax(fc2, name='sm') +softmax = mx.symbol.SoftmaxOutput(fc2, name='sm') # create a model model = mx.model.FeedForward.create( softmax, diff --git a/doc/python/symbol.md b/doc/python/symbol.md index b153fdb32773..fc8a0a99b8dd 100644 --- a/doc/python/symbol.md +++ b/doc/python/symbol.md @@ -23,7 +23,7 @@ The following code gives an example of two layer neural network configuration. 
>>> net = mx.symbol.FullyConnected(data=net, name='fc1', num_hidden=128) >>> net = mx.symbol.Activation(data=net, name='relu1', act_type="relu") >>> net = mx.symbol.FullyConnected(data=net, name='fc2', num_hidden=64) ->>> net = mx.symbol.Softmax(data=net, name='out') +>>> net = mx.symbol.SoftmaxOutput(data=net, name='out') >>> type(net) ``` @@ -68,7 +68,7 @@ You can use [mxnet.symbol.Group](#mxnet.symbol.Group) function to group the symb >>> fc1 = mx.symbol.FullyConnected(data=net, name='fc1', num_hidden=128) >>> net = mx.symbol.Activation(data=fc1, name='relu1', act_type="relu") >>> net = mx.symbol.FullyConnected(data=net, name='fc2', num_hidden=64) ->>> out = mx.symbol.Softmax(data=net, name='softmax') +>>> out = mx.symbol.SoftmaxOutput(data=net, name='softmax') >>> group = mx.symbol.Group([fc1, out]) >>> group.list_outputs() ['fc1_output', 'softmax_output'] @@ -102,7 +102,7 @@ Before you get started, you can check the list of functions in the following tab mxnet.symbol.LeakyReLU mxnet.symbol.Pooling mxnet.symbol.Reshape - mxnet.symbol.Softmax + mxnet.symbol.SoftmaxOutput ``` ```eval_rst diff --git a/doc/python/tutorial.md b/doc/python/tutorial.md index 09a70df07c04..7620afe0c8cb 100644 --- a/doc/python/tutorial.md +++ b/doc/python/tutorial.md @@ -227,7 +227,7 @@ The following codes create a two layer perceptrons network: >>> net = mx.symbol.FullyConnected(data=net, name='fc1', num_hidden=128) >>> net = mx.symbol.Activation(data=net, name='relu1', act_type="relu") >>> net = mx.symbol.FullyConnected(data=net, name='fc2', num_hidden=64) ->>> net = mx.symbol.Softmax(data=net, name='out') +>>> net = mx.symbol.SoftmaxOutput(data=net, name='out') >>> type(net) ``` diff --git a/example/cifar10/cifar10.py b/example/cifar10/cifar10.py index efb1122504a0..533f6714bd96 100644 --- a/example/cifar10/cifar10.py +++ b/example/cifar10/cifar10.py @@ -55,7 +55,7 @@ def SimpleFactory(data, ch_1x1, ch_3x3): pool = mx.symbol.Pooling(data=in5b, pool_type="avg", kernel=(7,7), name="global_pool") flatten = mx.symbol.Flatten(data=pool, name="flatten1") fc = mx.symbol.FullyConnected(data=flatten, num_hidden=10, name="fc1") -softmax = mx.symbol.Softmax(data=fc, name="loss") +softmax = mx.symbol.SoftmaxOutput(data=fc, name="loss") ######################################################### diff --git a/example/imagenet/alexnet.py b/example/imagenet/alexnet.py index dbf5e9a28ba4..b933b090e5b2 100644 --- a/example/imagenet/alexnet.py +++ b/example/imagenet/alexnet.py @@ -40,7 +40,7 @@ dropout2 = mx.symbol.Dropout(data=relu7, p=0.5) # stage 6 fc3 = mx.symbol.FullyConnected(data=dropout2, num_hidden=1000) -softmax = mx.symbol.Softmax(data=fc3) +softmax = mx.symbol.SoftmaxOutput(data=fc3) ## data diff --git a/example/imagenet/inception-full.py b/example/imagenet/inception-full.py index d703a6db59a2..1ac0a5c14a68 100644 --- a/example/imagenet/inception-full.py +++ b/example/imagenet/inception-full.py @@ -74,7 +74,7 @@ def inception(nhidden, grad_scale): # linear classifier flatten = mx.symbol.Flatten(data=avg, name='flatten') fc1 = mx.symbol.FullyConnected(data=flatten, num_hidden=nhidden, name='fc1') - softmax = mx.symbol.Softmax(data=fc1, name='softmax') + softmax = mx.symbol.SoftmaxOutput(data=fc1, name='softmax') return softmax softmax = inception(21841, 1.0) diff --git a/example/imagenet/inception.py b/example/imagenet/inception.py index 263f3a22733f..a9afe9c01f89 100644 --- a/example/imagenet/inception.py +++ b/example/imagenet/inception.py @@ -73,7 +73,7 @@ def inception(nhidden, grad_scale): # linear classifier 
flatten = mx.symbol.Flatten(data=avg, name='flatten') fc1 = mx.symbol.FullyConnected(data=flatten, num_hidden=nhidden, name='fc1') - softmax = mx.symbol.Softmax(data=fc1, name='softmax') + softmax = mx.symbol.SoftmaxOutput(data=fc1, name='softmax') return softmax softmax = inception(1000, 1.0) diff --git a/example/memcost/inception_memcost.py b/example/memcost/inception_memcost.py index 8183c6774724..eb9e16908035 100644 --- a/example/memcost/inception_memcost.py +++ b/example/memcost/inception_memcost.py @@ -69,7 +69,7 @@ def inception(nhidden, grad_scale): # linear classifier flatten = mx.symbol.Flatten(data=avg, name='flatten') fc1 = mx.symbol.FullyConnected(data=flatten, num_hidden=nhidden, name='fc1') - softmax = mx.symbol.Softmax(data=fc1, name='softmax') + softmax = mx.symbol.SoftmaxOutput(data=fc1, name='softmax') return softmax diff --git a/example/mnist/lenet.py b/example/mnist/lenet.py index 40779150ccfb..d8691bbe5867 100644 --- a/example/mnist/lenet.py +++ b/example/mnist/lenet.py @@ -23,7 +23,7 @@ # second fullc fc2 = mx.symbol.FullyConnected(data=tanh3, num_hidden=10) # loss -lenet = mx.symbol.Softmax(data=fc2) +lenet = mx.symbol.SoftmaxOutput(data=fc2) ## data train, val = mnist_iterator(batch_size=100, input_shape=(1,28,28)) diff --git a/example/mnist/mlp.py b/example/mnist/mlp.py index 0cfffe55cbe4..2bfa55d913ba 100644 --- a/example/mnist/mlp.py +++ b/example/mnist/mlp.py @@ -11,7 +11,7 @@ fc2 = mx.symbol.FullyConnected(data = act1, name = 'fc2', num_hidden = 64) act2 = mx.symbol.Activation(data = fc2, name='relu2', act_type="relu") fc3 = mx.symbol.FullyConnected(data = act2, name='fc3', num_hidden=10) -mlp = mx.symbol.Softmax(data = fc3, name = 'mlp') +mlp = mx.symbol.SoftmaxOutput(data = fc3, name = 'mlp') # data diff --git a/example/mnist/mlp_numpy.py b/example/mnist/mlp_numpy.py index 114a6bf257d5..538aa87c7c23 100644 --- a/example/mnist/mlp_numpy.py +++ b/example/mnist/mlp_numpy.py @@ -11,7 +11,7 @@ fc2 = mx.symbol.FullyConnected(data = act1, name = 'fc2', num_hidden = 64) act2 = mx.symbol.Activation(data = fc2, name='relu2', act_type="relu") fc3 = mx.symbol.FullyConnected(data = act2, name='fc3', num_hidden=10) -mlp = mx.symbol.Softmax(data = fc3, name = 'mlp') +mlp = mx.symbol.SoftmaxOutput(data = fc3, name = 'mlp') # data diff --git a/example/notebooks/cifar-100.ipynb b/example/notebooks/cifar-100.ipynb index 8e8c53a2d75b..bb5cb1b81624 100644 --- a/example/notebooks/cifar-100.ipynb +++ b/example/notebooks/cifar-100.ipynb @@ -131,7 +131,7 @@ " # linear classifier\n", " flatten = mx.symbol.Flatten(data=avg, name='flatten')\n", " fc1 = mx.symbol.FullyConnected(data=flatten, num_hidden=nhidden, name='fc')\n", - " softmax = mx.symbol.Softmax(data=fc1, name='softmax')\n", + " softmax = mx.symbol.SoftmaxOutput(data=fc1, name='softmax')\n", " return softmax\n", "\n", "softmax = inception(100, 1.0)" diff --git a/example/notebooks/cifar-recipe.ipynb b/example/notebooks/cifar-recipe.ipynb index eae38dab736c..7c436554fa47 100644 --- a/example/notebooks/cifar-recipe.ipynb +++ b/example/notebooks/cifar-recipe.ipynb @@ -127,7 +127,7 @@ "pool = mx.symbol.Pooling(data=in5b, pool_type=\"avg\", kernel=(7,7), name=\"global_avg\")\n", "flatten = mx.symbol.Flatten(data=pool)\n", "fc = mx.symbol.FullyConnected(data=flatten, num_hidden=10)\n", - "softmax = mx.symbol.Softmax(data=fc)" + "softmax = mx.symbol.SoftmaxOutput(data=fc)" ] }, { diff --git a/example/notebooks/composite_symbol.ipynb b/example/notebooks/composite_symbol.ipynb index 22966f5fd3f5..1d2cdaec764d 100644 --- 
a/example/notebooks/composite_symbol.ipynb +++ b/example/notebooks/composite_symbol.ipynb @@ -3691,7 +3691,7 @@ "\n", "softmax0\n", "\n", - "Softmax\n", + "SoftmaxOutput\n", "\n", "\n", "softmax0->fullyconnected0\n", @@ -3739,7 +3739,7 @@ "# linear classifier\n", "flatten = mx.symbol.Flatten(data=avg)\n", "fc1 = mx.symbol.FullyConnected(data=flatten, num_hidden=1000)\n", - "softmax = mx.symbol.Softmax(data=fc1)\n", + "softmax = mx.symbol.SoftmaxOutput(data=fc1)\n", "\n", "# if you like, you can visualize full network structure\n", "mx.viz.plot_network(symbol=softmax, shape={\"data\" : (128, 3, 224, 224)})" diff --git a/example/numpy-ops/data.py b/example/numpy-ops/data.py new file mode 100644 index 000000000000..d39821f52145 --- /dev/null +++ b/example/numpy-ops/data.py @@ -0,0 +1,32 @@ +# pylint: skip-file +""" data iterator for mnist """ +import sys +import os +# code to automatically download dataset +curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) +sys.path.append(os.path.join(curr_path, "../../tests/python/common")) +import get_data +import mxnet as mx + +def mnist_iterator(batch_size, input_shape): + """return train and val iterators for mnist""" + # download data + get_data.GetMNIST_ubyte() + flat = False if len(input_shape) == 3 else True + + train_dataiter = mx.io.MNISTIter( + image="data/train-images-idx3-ubyte", + label="data/train-labels-idx1-ubyte", + input_shape=input_shape, + batch_size=batch_size, + shuffle=True, + flat=flat) + + val_dataiter = mx.io.MNISTIter( + image="data/t10k-images-idx3-ubyte", + label="data/t10k-labels-idx1-ubyte", + input_shape=input_shape, + batch_size=batch_size, + flat=flat) + + return (train_dataiter, val_dataiter) diff --git a/example/numpy-ops/numpy_softmax.py b/example/numpy-ops/numpy_softmax.py new file mode 100644 index 000000000000..1ea5f0051061 --- /dev/null +++ b/example/numpy-ops/numpy_softmax.py @@ -0,0 +1,60 @@ +# pylint: skip-file +from data import mnist_iterator +import mxnet as mx +import numpy as np +import logging + + +class NumpySoftmax(mx.operator.NumpyOp): + def need_top_grad(self): + return False + + def list_arguments(self): + return ['data', 'label'] + + def list_outputs(self): + return ['prob'] + + def infer_shape(self, in_shape): + return [in_shape[0], (in_shape[0][0],)], [in_shape[0]] + + def forward(self, in_data, out_data): + x = in_data[0] + y = out_data[0] + y[:] = np.exp(x - x.max(axis=1).reshape((x.shape[0], 1))) + y /= y.sum(axis=1).reshape((x.shape[0], 1)) + + def backward(self, out_grad, in_data, out_data, in_grad): + l = in_data[1] + l = l.reshape((l.size,)).astype(np.int) + y = out_data[0] + dx = in_grad[0] + dx[:] = y + dx[np.arange(l.shape[0]), l] -= 1.0 + +# define mlp + +data = mx.symbol.Variable('data') +fc1 = mx.symbol.FullyConnected(data = data, name='fc1', num_hidden=128) +act1 = mx.symbol.Activation(data = fc1, name='relu1', act_type="relu") +fc2 = mx.symbol.FullyConnected(data = act1, name = 'fc2', num_hidden = 64) +act2 = mx.symbol.Activation(data = fc2, name='relu2', act_type="relu") +fc3 = mx.symbol.FullyConnected(data = act2, name='fc3', num_hidden=10) +#mlp = mx.symbol.Softmax(data = fc3, name = 'mlp') +mysoftmax = NumpySoftmax() +mlp = mysoftmax(data=fc3, name = 'mlp') + +# data + +train, val = mnist_iterator(batch_size=100, input_shape = (784,)) + +# train + +logging.basicConfig(level=logging.DEBUG) + +model = mx.model.FeedForward( + ctx = mx.gpu(), symbol = mlp, num_epoch = 20, + learning_rate = 0.1, momentum = 0.9, wd = 0.00001) + +model.fit(X=train, eval_data=val) 
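[Editor's note] The `backward` method in `numpy_softmax.py` above leans on the standard softmax-plus-cross-entropy identity: for loss L = -log y[label] with y = softmax(x), the gradient is dL/dx = y - onehot(label), which is why the code copies y into dx and subtracts 1 at the label position. A quick, self-contained NumPy check of that identity, independent of mxnet:

```python
import numpy as np

def softmax(x):
    e = np.exp(x - x.max(axis=1, keepdims=True))
    return e / e.sum(axis=1, keepdims=True)

rng = np.random.RandomState(0)
x, label, eps = rng.randn(1, 5), 2, 1e-6

analytic = softmax(x).copy()   # dL/dx = y - onehot(label)
analytic[0, label] -= 1.0

numeric = np.zeros_like(x)     # central finite differences
for j in range(x.shape[1]):
    xp, xm = x.copy(), x.copy()
    xp[0, j] += eps
    xm[0, j] -= eps
    numeric[0, j] = (-np.log(softmax(xp)[0, label])
                     + np.log(softmax(xm)[0, label])) / (2 * eps)

assert np.allclose(analytic, numeric, atol=1e-4)
```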
+ diff --git a/example/python-howto/multiple_outputs.py b/example/python-howto/multiple_outputs.py index ab6d6d12356c..97ce469d58a2 100644 --- a/example/python-howto/multiple_outputs.py +++ b/example/python-howto/multiple_outputs.py @@ -8,7 +8,7 @@ fc1 = mx.symbol.FullyConnected(data=net, name='fc1', num_hidden=128) net = mx.symbol.Activation(data=fc1, name='relu1', act_type="relu") net = mx.symbol.FullyConnected(data=net, name='fc2', num_hidden=64) -out = mx.symbol.Softmax(data=net, name='softmax') +out = mx.symbol.SoftmaxOutput(data=net, name='softmax') # group fc1 and out together group = mx.symbol.Group([fc1, out]) print group.list_outputs() diff --git a/example/rnn/lstm.py b/example/rnn/lstm.py index 25245aad18ee..4b5706ff7208 100644 --- a/example/rnn/lstm.py +++ b/example/rnn/lstm.py @@ -83,7 +83,7 @@ def lstm_unroll(num_lstm_layer, seq_len, bias=cls_bias, num_hidden=num_label, name="t%d_cls" % seqidx) - sm = mx.sym.Softmax(data=fc, label=label, name="t%d_sm" % seqidx) + sm = mx.sym.SoftmaxOutput(data=fc, label=label, name="t%d_sm" % seqidx) out_prob.append(sm) for i in range(num_lstm_layer): @@ -216,7 +216,7 @@ def train_lstm(model, X_train_batch, X_val_batch, set_rnn_inputs(m, X_train_batch, begin=begin) m.rnn_exec.forward(is_train=True) # probability of each label class, used to evaluate nll - seq_label_probs = [mx.nd.choose_element(out, label).copyto(mx.cpu()) + seq_label_probs = [mx.nd.choose_element_0index(out, label).copyto(mx.cpu()) for out, label in zip(m.seq_outputs, m.seq_labels)] m.rnn_exec.backward() # transfer the states @@ -251,7 +251,7 @@ def train_lstm(model, X_train_batch, X_val_batch, set_rnn_inputs(m, X_val_batch, begin=begin) m.rnn_exec.forward(is_train=False) # probability of each label class, used to evaluate nll - seq_label_probs = [mx.nd.choose_element(out, label).copyto(mx.cpu()) + seq_label_probs = [mx.nd.choose_element_0index(out, label).copyto(mx.cpu()) for out, label in zip(m.seq_outputs, m.seq_labels)] # transfer the states for init, last in zip(m.init_states, m.last_states): diff --git a/include/mxnet/base.h b/include/mxnet/base.h index 1eeffc1ab4b9..52100cdf05ea 100644 --- a/include/mxnet/base.h +++ b/include/mxnet/base.h @@ -50,8 +50,8 @@ #endif /*! -* \brief define dllexport for Visual Studio -*/ + * \brief define dllexport for Visual Studio + */ #ifdef _MSC_VER #ifdef MXNET_EXPORTS #define MXNET_API __declspec(dllexport) @@ -62,6 +62,14 @@ #define MXNET_API #endif +/*! + * \brief define prediction only + */ +#ifndef MXNET_PREDICT_ONLY +#define MXNET_PREDICT_ONLY 0 +#endif + + /*! \brief namespace of mxnet */ namespace mxnet { /*! \brief mxnet cpu */ diff --git a/include/mxnet/c_api.h b/include/mxnet/c_api.h index 2bbda3ddbf0e..84754977ba3e 100644 --- a/include/mxnet/c_api.h +++ b/include/mxnet/c_api.h @@ -48,6 +48,16 @@ typedef void *DataIterHandle; typedef void *KVStoreHandle; /*! \brief handle to RecordIO */ typedef void *RecordIOHandle; + +MXNET_EXTERN_C { +struct NativeOpInfo { + void (*forward)(int, float**, int*, unsigned**, int*); + void (*backward)(int, float**, int*, unsigned**, int*); + void (*infer_shape)(int, int*, unsigned**); + void (*list_outputs)(char***); + void (*list_arguments)(char***); +}; +} /*! 
* \brief return str message of the last error * all function in this file will return 0 when success diff --git a/include/mxnet/c_predict_api.h b/include/mxnet/c_predict_api.h index e5671da33cbc..9163a6c3e910 100644 --- a/include/mxnet/c_predict_api.h +++ b/include/mxnet/c_predict_api.h @@ -9,6 +9,8 @@ #ifdef __cplusplus #define MXNET_EXTERN_C extern "C" +#else +#define MXNET_EXTERN_C #endif #ifdef _WIN32 diff --git a/mshadow b/mshadow index 27ba6a635e81..74be312ab6f2 160000 --- a/mshadow +++ b/mshadow @@ -1 +1 @@ -Subproject commit 27ba6a635e81ac6e9f0f30a1ab1bf1d32e56f7d8 +Subproject commit 74be312ab6f20178766901a7caf021d4829e9110 diff --git a/python/mxnet/__init__.py b/python/mxnet/__init__.py index a036e003ba77..abd842748a57 100644 --- a/python/mxnet/__init__.py +++ b/python/mxnet/__init__.py @@ -13,6 +13,7 @@ from . import symbol from . import io from . import recordio +from . import operator # use mx.nd as short for mx.ndarray from . import ndarray as nd # use mx.rnd as short for mx.random diff --git a/python/mxnet/base.py b/python/mxnet/base.py index d6aec6509b85..4ae621a86c7f 100644 --- a/python/mxnet/base.py +++ b/python/mxnet/base.py @@ -105,7 +105,6 @@ def c_array(ctype, values): """ return (ctype * len(values))(*values) - def ctypes2buffer(cptr, length): """Convert ctypes pointer to buffer type. @@ -197,7 +196,6 @@ def ctypes2docstring(num_args, arg_names, arg_types, arg_descs, remove_dup=True) doc_str = doc_str % ('\n'.join(param_str)) return doc_str - def _notify_shutdown(): """Notify MXNet about a shutdown.""" check_call(_LIB.MXNotifyShutdown()) diff --git a/python/mxnet/callback.py b/python/mxnet/callback.py index dde8f40db9c5..8d08e40ba7d3 100644 --- a/python/mxnet/callback.py +++ b/python/mxnet/callback.py @@ -1,5 +1,5 @@ # coding: utf-8 -"""Callback functions that can be used to track various status during iteration.""" +"""Callback functions that can be used to track various status during each epoch.""" from __future__ import absolute_import import sys @@ -9,7 +9,7 @@ from .model import save_checkpoint def do_checkpoint(prefix): - """Callback to checkpoint the model to prefix every iteration. + """Callback to checkpoint the model to prefix every epoch. Parameters ---------- @@ -45,7 +45,7 @@ def _callback(param): if param.nbatch % period == 0: name, value = param.eval_metric.get() logging.info('Iter[%d] Batch[%d] Train-%s=%f', - param.iteration, param.nbatch, name, value) + param.epoch, param.nbatch, name, value) return _callback @@ -77,7 +77,7 @@ def __call__(self, param): if count % self.frequent == 0: speed = self.frequent * self.batch_size / (time.time() - self.tic) logging.info("Iter[%d] Batch [%d]\tSpeed: %.2f samples/sec", - param.iteration, count, speed) + param.epoch, count, speed) self.tic = time.time() else: self.init = True diff --git a/python/mxnet/operator.py b/python/mxnet/operator.py new file mode 100644 index 000000000000..1cdd4277d894 --- /dev/null +++ b/python/mxnet/operator.py @@ -0,0 +1,186 @@ +# coding: utf-8 +# pylint: disable=invalid-name, protected-access, too-many-arguments, no-self-use +"""numpy interface for operators.""" +from __future__ import absolute_import + +from ctypes import CFUNCTYPE, POINTER, Structure, pointer, c_void_p, cast, c_int, c_char, c_char_p +from .base import c_array, c_str, mx_uint, mx_float, ctypes2numpy_shared +from . import symbol + +class NumpyOp(object): + """Base class for numpy operators. numpy operators allow parts + of computation in symbolic graph to be written in numpy.
This feature + is intended for quickly hacking out a solution for non-performance-critical + parts. Please consider writing a C++ implementation if it becomes + a bottleneck. + """ + def __init__(self): + self.info_ = None + + def __call__(self, *args, **kwargs): + return self.get_symbol(*args, **kwargs) + + def get_symbol(self, *args, **kwargs): + """Create a symbol from numpy operator. + This should only be called once per instance if the operator contains + internal states. + + Parameters + ---------- + args : list + a list of input arguments (symbols) + + Returns + ------- + sym : mxnet.symbol.Symbol + """ + fb_functype = CFUNCTYPE(None, c_int, POINTER(POINTER(mx_float)), POINTER(c_int), + POINTER(POINTER(mx_uint)), POINTER(c_int)) + infer_functype = CFUNCTYPE(None, c_int, POINTER(c_int), POINTER(POINTER(mx_uint))) + list_functype = CFUNCTYPE(None, POINTER(POINTER(POINTER(c_char)))) + class NumpyOpInfo(Structure): + """Structure that holds Callback information. Passed to NumpyOpProp""" + _fields_ = [ + ('forward', fb_functype), + ('backward', fb_functype), + ('infer_shape', infer_functype), + ('list_outputs', list_functype), + ('list_arguments', list_functype) + ] + def forward_entry(num_tensor, tensor_ptrs, tensor_dims, + tensor_shapes, tensor_tags): + """C Callback for NumpyOp::Forward""" + tensors = [[] for i in range(4)] + for i in range(num_tensor): + shape = [tensor_shapes[i][j] for j in range(tensor_dims[i])] + buff = ctypes2numpy_shared(tensor_ptrs[i], shape) + tensors[tensor_tags[i]].append(buff) + self.forward(in_data=tensors[0], out_data=tensors[1]) + + def backward_entry(num_tensor, tensor_ptrs, tensor_dims, + tensor_shapes, tensor_tags): + """C Callback for NumpyOp::Backward""" + tensors = [[] for i in range(4)] + for i in range(num_tensor): + shape = [tensor_shapes[i][j] for j in range(tensor_dims[i])] + buff = ctypes2numpy_shared(tensor_ptrs[i], shape) + tensors[tensor_tags[i]].append(buff) + self.backward(in_data=tensors[0], out_data=tensors[1], + in_grad=tensors[2], out_grad=tensors[3]) + + def infer_shape_entry(num_tensor, tensor_dims, + tensor_shapes): + """C Callback for NumpyOpProp::InferShape""" + n_in = len(self.list_arguments()) + n_out = len(self.list_outputs()) + assert num_tensor == n_in + n_out + + shapes = [[tensor_shapes[i][j] for j in range(tensor_dims[i])] for i in range(n_in)] + ishape, oshape = self.infer_shape(shapes) + assert len(oshape) == n_out + assert len(ishape) == n_in + rshape = list(ishape) + list(oshape) + for i in range(n_in+n_out): + tensor_shapes[i] = cast(c_array(mx_uint, rshape[i]), POINTER(mx_uint)) + tensor_dims[i] = len(rshape[i]) + + def list_outputs_entry(out): + """C Callback for NumpyOpProp::ListOutputs""" + ret = self.list_outputs() + ret = [c_str(i) for i in ret] + [c_char_p(0)] + ret = c_array(c_char_p, ret) + out[0] = cast(ret, POINTER(POINTER(c_char))) + + def list_arguments_entry(out): + """C Callback for NumpyOpProp::ListArguments""" + ret = self.list_arguments() + ret = [c_str(i) for i in ret] + [c_char_p(0)] + ret = c_array(c_char_p, ret) + out[0] = cast(ret, POINTER(POINTER(c_char))) + + + self.info_ = NumpyOpInfo(fb_functype(forward_entry), + fb_functype(backward_entry), + infer_functype(infer_shape_entry), + list_functype(list_outputs_entry), + list_functype(list_arguments_entry)) + cb_ptr = hex(cast(pointer(self.info_), c_void_p).value) + # pylint: disable=E1101 + return symbol.Symbol._Native(*args, + info=cb_ptr, + need_top_grad=self.need_top_grad(), + **kwargs) + + def forward(self, in_data, out_data): + """forward
interface. Override to create new operators. + + Parameters + ---------- + in_data, out_data: list + input and output for forward. See document for + corresponding arguments of Operator::Forward + """ + out_data[0][:] = in_data[0] + + def backward(self, out_grad, in_data, out_data, in_grad): + """backward interface. Override to create new operators. + + Parameters + ---------- + out_grad, in_data, out_data, in_grad : list + input and output for backward. See document for + corresponding arguments of Operator::Backward + """ + # pylint: disable=W0613 + in_grad[0][:] = 1.0 + + def infer_shape(self, in_shape): + """infer_shape interface. Override to create new operators. + + Parameters + ---------- + in_shape : list + list of argument shapes in the same order as + declared in list_arguments. + + Returns + ------- + in_shape : list + list of argument shapes. Can be modified from in_shape. + out_shape : list + list of output shapes calculated from in_shape, + in the same order as declared in list_outputs. + """ + return in_shape, [in_shape[0]] + + def list_outputs(self): + """list_outputs interface. Override to create new operators. + + Returns + ------- + outputs : list + list of output blob names. + """ + return ['output'] + + def list_arguments(self): + """list_arguments interface. Override to create new operators. + + Returns + ------- + arguments : list + list of argument blob names. + """ + return ['data'] + + def need_top_grad(self): + """Whether this operator needs out_grad for backward. + + Returns + ------- + need_top_grad : bool + Whether this operator needs out_grad for backward. + Should be set to False for loss layers. + """ + return True diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index aaac3ee61d08..8706ac1cc86c 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -972,8 +972,9 @@ int MXKVStoreIsSchedulerNode(int *ret) { int MXKVStoreRunServer(KVStoreHandle handle, MXKVStoreServerController controller) { API_BEGIN(); - auto ctrl = [controller](int head, const std::string& body) { - controller(head, body.c_str()); + MXKVStoreServerController *controller_temp = controller; + auto ctrl = [controller_temp](int head, const std::string& body) { + controller_temp(head, body.c_str()); }; static_cast<KVStore*>(handle)->RunServer(ctrl); API_END(); diff --git a/src/common/tblob_op_registry.cc b/src/common/tblob_op_registry.cc index 8dac8944f144..064cc4b1cc6f 100644 --- a/src/common/tblob_op_registry.cc +++ b/src/common/tblob_op_registry.cc @@ -252,7 +252,7 @@ class TBlobUnaryOpProp : public OperatorProperty { } } - Operator* CreateOperator(Context ctx) const { + Operator* CreateOperator(Context ctx) const override { size_t dev_mask = ctx.dev_mask(); TBlobUnaryOperator *op = new TBlobUnaryOperator(); CHECK(dev_mask < source->funary_.size() && source->funary_[dev_mask] != nullptr); diff --git a/src/engine/engine.cc b/src/engine/engine.cc index eececfa91e04..ae72861260e1 100644 --- a/src/engine/engine.cc +++ b/src/engine/engine.cc @@ -17,6 +17,7 @@ inline Engine* CreateEngine() { std::string stype = type; Engine *ret = nullptr; + #if MXNET_PREDICT_ONLY == 0 if (stype == "NaiveEngine") { ret = CreateNaiveEngine(); } else if (stype == "ThreadedEngine") { @@ -24,6 +25,9 @@ } else if (stype == "ThreadedEnginePerDevice") { ret = CreateThreadedEnginePerDevice(); } + #else + ret = CreateNaiveEngine(); + #endif if (ret ==nullptr) { LOG(FATAL) << "Cannot find Engine " << type; diff --git a/src/engine/engine_impl.h
b/src/engine/engine_impl.h index 44452df7b9c5..9d3fc4cd09f7 100644 --- a/src/engine/engine_impl.h +++ b/src/engine/engine_impl.h @@ -71,10 +71,12 @@ static constexpr std::size_t kMaxNumGPUs = 16; // predeclare factory function for each type of engine /*! \return NaiveEngine instance */ Engine *CreateNaiveEngine(); +#if MXNET_PREDICT_ONLY == 0 /*! \return ThreadedEnginePooled instance */ Engine *CreateThreadedEnginePooled(); /*! \return ThreadedEnginePerDevie instance */ Engine *CreateThreadedEnginePerDevice(); +#endif } // namespace engine } // namespace mxnet #endif // MXNET_ENGINE_ENGINE_IMPL_H_ diff --git a/src/kvstore/kvstore_dist_server.h b/src/kvstore/kvstore_dist_server.h index d25d6d95d989..0bf446d50ca5 100644 --- a/src/kvstore/kvstore_dist_server.h +++ b/src/kvstore/kvstore_dist_server.h @@ -39,9 +39,9 @@ class Executor { lk.unlock(); if (blk.f) { - blk.f(); blk.p.set_value(); + blk.f(); blk.p->set_value(); } else { - blk.p.set_value(); break; + blk.p->set_value(); break; } lk.lock(); } @@ -57,7 +57,7 @@ class Executor { */ void Exec(const Func& func) { Block blk(func); - auto fut = blk.p.get_future(); + auto fut = blk.p->get_future(); { std::lock_guard lk(mu_); queue_.push(std::move(blk)); @@ -75,9 +75,9 @@ class Executor { private: struct Block { - explicit Block(const Func& func) : f(func) { } + explicit Block(const Func& func) : f(func), p(std::make_shared>()) { } Func f; - std::promise p; + std::shared_ptr> p; }; std::queue queue_; std::mutex mu_; diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc index 404c0891f984..9ea7321195c0 100644 --- a/src/ndarray/ndarray.cc +++ b/src/ndarray/ndarray.cc @@ -595,6 +595,7 @@ void NDArray::SyncCopyToCPU(real_t *data, size_t size) const { } } +#if MXNET_PREDICT_ONLY == 0 // register API function // those with underscore will be registered at NDArray MXNET_REGISTER_NDARRAY_FUN(_set_value).set_function(SetValueOp); @@ -610,10 +611,11 @@ MXNET_REGISTER_NDARRAY_FUN(dot).set_function(BinaryOp) MXNET_REGISTER_NDARRAY_FUN(_onehot_encode).set_function(BinaryOp); -MXNET_REGISTER_NDARRAY_FUN(choose_element) +MXNET_REGISTER_NDARRAY_FUN(choose_element_0index) .set_function(BinaryOp) .describe("Choose one element from each line(row for python, column for R/Julia)" - " in lhs according to index indicated by rhs"); + " in lhs according to index indicated by rhs." 
+ " This function assumes rhs uses 0-based indexing."); // register API function // those with underscore will be registered at NDArray @@ -659,4 +661,5 @@ MXNET_REGISTER_NDARRAY_FUN(clip) .add_argument("src", "NDArray", "Source input") .add_argument("a_min", "real_t", "Minimum value") .add_argument("a_max", "real_t", "Maximum value"); +#endif } // namespace mxnet diff --git a/src/operator/activation-inl.h b/src/operator/activation-inl.h index 6280c1664e84..cca8d7824697 100644 --- a/src/operator/activation-inl.h +++ b/src/operator/activation-inl.h @@ -144,7 +144,7 @@ class ActivationProp : public OperatorProperty { return {{in_data[activation::kData], out_data[activation::kOut]}}; } - Operator* CreateOperator(Context ctx) const; + Operator* CreateOperator(Context ctx) const override; private: ActivationParam param_; diff --git a/src/operator/batch_norm-inl.h b/src/operator/batch_norm-inl.h index f031058f899e..8ae6d30a50bb 100644 --- a/src/operator/batch_norm-inl.h +++ b/src/operator/batch_norm-inl.h @@ -273,7 +273,7 @@ class BatchNormProp : public OperatorProperty { return {"moving_mean", "moving_var"}; } - Operator* CreateOperator(Context ctx) const; + Operator* CreateOperator(Context ctx) const override; private: BatchNormParam param_; diff --git a/src/operator/block_grad-inl.h b/src/operator/block_grad-inl.h index 012dc7a2da63..ff5262d4e04a 100644 --- a/src/operator/block_grad-inl.h +++ b/src/operator/block_grad-inl.h @@ -102,7 +102,7 @@ class BlockGradientProp : public OperatorProperty { return {{in_data[blockgrad::kData], out_data[blockgrad::kOut]}}; } - Operator* CreateOperator(Context ctx) const; + Operator* CreateOperator(Context ctx) const override; }; // class BlockGradientProperty #endif // DMLC_USE_CXX11 diff --git a/src/operator/channel_op_common.h b/src/operator/channel_op_common.h index cd8b972a9792..72609ba51b96 100644 --- a/src/operator/channel_op_common.h +++ b/src/operator/channel_op_common.h @@ -14,41 +14,46 @@ namespace mxnet { namespace op { -using mshadow::expr::concat; - - template inline void Concatenate(const std::vector > &input, mshadow::Tensor *output) { + using mshadow::expr::concat; + using mshadow::expr::slice; mshadow::Tensor out = *output; size_t size = input.size(); switch (size) { - case 2: + case 2: { out = concat<1>(input[0], input[1]); break; - case 3: + } + case 3: { out = concat<1>(input[0], concat<1>(input[1], input[2])); break; - case 4: + } + case 4: { out = concat<1>(input[0], concat<1>(input[1], concat<1>(input[2], input[3]))); break; - case 5: - out = concat<1>(input[0], - concat<1>(input[1], - concat<1>(input[2], - concat<1>(input[3], input[4])))); + } + default: { + index_t begin = 0; + for (index_t i = 0; i < size; ++i) { + index_t end = begin + input[i].size(1); + slice<1>(out, begin, end) = input[i]; + begin = end; + } break; - default: - LOG(FATAL) << "Incorrect concat size: " << size; + } } } template void Split(const mshadow::Tensor &input, std::vector > *output) { + using mshadow::expr::concat; + using mshadow::expr::slice; std::vector > out = *output; size_t size = out.size(); switch (size) { @@ -67,15 +72,15 @@ void Split(const mshadow::Tensor &input, concat<1>(out[2], out[3]))) = input; break; } - case 5: { - concat<1>(out[0], - concat<1>(out[1], - concat<1>(out[2], - concat<1>(out[3], out[4])))) = input; + default: { + index_t begin = 0; + for (index_t i = 0; i < size; ++i) { + index_t end = begin + out[i].size(1); + out[i] = slice<1>(input, begin, end); + begin = end; + } break; } - default: - LOG(FATAL) << "Incorrect concat size: "
<< size; } } } // namespace op diff --git a/src/operator/concat-inl.h b/src/operator/concat-inl.h index 3e9c812603e3..13c53e7c2246 100644 --- a/src/operator/concat-inl.h +++ b/src/operator/concat-inl.h @@ -28,7 +28,7 @@ enum ConcatOpOutputs {kOut}; struct ConcatParam : public dmlc::Parameter { int num_args; DMLC_DECLARE_PARAMETER(ConcatParam) { - DMLC_DECLARE_FIELD(num_args).set_range(1, 6) + DMLC_DECLARE_FIELD(num_args).set_lower_bound(1) .describe("Number of inputs to be concated."); } }; // struct ConcatParam @@ -175,7 +175,7 @@ class ConcatProp : public OperatorProperty { return out_grad; } - Operator* CreateOperator(Context ctx) const; + Operator* CreateOperator(Context ctx) const override; private: ConcatParam param_; diff --git a/src/operator/convolution-inl.h b/src/operator/convolution-inl.h index 29a9288b2870..6a6c8590daf2 100644 --- a/src/operator/convolution-inl.h +++ b/src/operator/convolution-inl.h @@ -46,7 +46,9 @@ struct ConvolutionParam : public dmlc::Parameter { DMLC_DECLARE_FIELD(num_filter).set_range(1, 100000) .describe("convolution filter(channel) number"); DMLC_DECLARE_FIELD(num_group).set_default(1) - .describe("number of groups partition"); + .describe("Number of group partitions. " + "This option is not supported by CuDNN; you can achieve the same effect by " + "using SliceChannel to split into num_group groups, applying convolution to each, and concatenating the results."); DMLC_DECLARE_FIELD(workspace).set_default(512).set_range(128, 4096) .describe("Tmp workspace for convolution (MB)"); DMLC_DECLARE_FIELD(no_bias).set_default(false) @@ -347,7 +349,7 @@ class ConvolutionProp : public OperatorProperty { return {ResourceRequest::kTempSpace}; } - Operator* CreateOperator(Context ctx) const; + Operator* CreateOperator(Context ctx) const override; private: ConvolutionParam param_; diff --git a/src/operator/cudnn_deconvolution-inl.h b/src/operator/cudnn_deconvolution-inl.h new file mode 100644 index 000000000000..5b6dc4b7b8c8 --- /dev/null +++ b/src/operator/cudnn_deconvolution-inl.h @@ -0,0 +1,280 @@ +/*! + * Copyright (c) 2015 by Contributors + * \file cudnn_deconvolution-inl.h + * \brief + * \author Wei Wu +*/ +#ifndef MXNET_OPERATOR_CUDNN_DECONVOLUTION_INL_H_ +#define MXNET_OPERATOR_CUDNN_DECONVOLUTION_INL_H_ + +#include +#include +#include "./deconvolution-inl.h" + +namespace mxnet { +namespace op { +#if defined(__CUDACC__) && MXNET_USE_CUDNN == 1 +class CuDNNDeconvolutionOp : public Operator { + public: + explicit CuDNNDeconvolutionOp(DeconvolutionParam param) { + this->param_ = param; + // convert MB to words + param_.workspace = (param_.workspace << 20) / sizeof(real_t); + init_cudnn_ = false; + // TODO(xxx): fp16 + dtype_ = CUDNN_DATA_FLOAT; + } + + ~CuDNNDeconvolutionOp() { + if (init_cudnn_) { + CHECK_EQ(cudnnDestroyTensorDescriptor(in_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnDestroyTensorDescriptor(out_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnDestroyTensorDescriptor(bias_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnDestroyFilterDescriptor(filter_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnDestroyConvolutionDescriptor(conv_desc_), CUDNN_STATUS_SUCCESS); + } + } + + virtual void Forward(const OpContext &ctx, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data, + const std::vector &aux_args) { + using namespace mshadow; + size_t expected = param_.no_bias ?
2 : 3; + float alpha = 1.0f; + float beta = 0.0f; + CHECK_EQ(in_data.size(), expected); + CHECK_EQ(out_data.size(), 1); + Stream *s = ctx.get_stream(); + Tensor data = in_data[deconv::kData].get(s); + Tensor wmat = in_data[deconv::kWeight].get(s); + Tensor out = out_data[deconv::kOut].get(s); + CHECK_EQ(data.CheckContiguous(), true); + CHECK_EQ(wmat.CheckContiguous(), true); + CHECK_EQ(out.CheckContiguous(), true); + if (!init_cudnn_) { + Init(s, in_data, out_data); + } + Tensor workspace = ctx.requested[deconv::kTempSpace].get_space( + mshadow::Shape1(forward_workspace_), s); + CHECK_EQ(cudnnConvolutionBackwardData_v3(s->dnn_handle_, + &alpha, + filter_desc_, + wmat.dptr_, + in_desc_, + data.dptr_, + conv_desc_, + back_algo_, + workspace.dptr_, + backward_workspace_byte_, + &beta, + out_desc_, + out.dptr_), CUDNN_STATUS_SUCCESS); + if (!param_.no_bias) { + beta = 1.0f; + Tensor bias = in_data[deconv::kBias].get(s); + CHECK_EQ(cudnnAddTensor(s->dnn_handle_, + CUDNN_ADD_SAME_C, + &alpha, + bias_desc_, + bias.dptr_, + &beta, + out_desc_, + out.dptr_), CUDNN_STATUS_SUCCESS); + } + } + + virtual void Backward(const OpContext &ctx, + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &req, + const std::vector &in_grad, + const std::vector &aux_args) { + using namespace mshadow; + using namespace mshadow::expr; + float alpha = 1.0f; + float beta = 0.0f; + size_t expected = param_.no_bias == 0 ? 3 : 2; + CHECK_EQ(out_grad.size(), 1); + CHECK(in_data.size() == expected && in_grad.size() == expected); + // TODO(bing): think about how to support add to + CHECK_EQ(req[deconv::kWeight], kWriteTo); + Stream *s = ctx.get_stream(); + Tensor grad = out_grad[deconv::kOut].get(s); + Tensor wmat = in_data[deconv::kWeight].get(s); + Tensor gwmat = in_grad[deconv::kWeight].get(s); + Tensor data = in_data[deconv::kData].get(s); + Tensor gdata = in_grad[deconv::kData].get(s); + Tensor workspace = ctx.requested[deconv::kTempSpace].get_space( + mshadow::Shape1(backward_workspace_), s); + if (!param_.no_bias) { + Tensor gbias = in_grad[deconv::kBias].get(s); + CHECK_EQ(cudnnConvolutionBackwardBias(s->dnn_handle_, + &alpha, + out_desc_, + grad.dptr_, + &beta, + bias_desc_, + gbias.dptr_), CUDNN_STATUS_SUCCESS); + } + CHECK_EQ(cudnnConvolutionBackwardFilter_v3(s->dnn_handle_, + &alpha, + out_desc_, + grad.dptr_, + in_desc_, + data.dptr_, + conv_desc_, + back_algo_w_, + workspace.dptr_, + backward_workspace_byte_, + &beta, + filter_desc_, + gwmat.dptr_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnConvolutionForward(s->dnn_handle_, + &alpha, + out_desc_, + grad.dptr_, + filter_desc_, + wmat.dptr_, + conv_desc_, + algo_, + workspace.dptr_, + forward_workspace_byte_, + &beta, + in_desc_, + gdata.dptr_), CUDNN_STATUS_SUCCESS); + } + + private: + inline void Init(mshadow::Stream *s, + const std::vector &in_data, + const std::vector &out_data) { + using namespace mshadow; + size_t expected = param_.no_bias ? 
2 : 3; + CHECK_EQ(in_data.size(), expected); + CHECK_EQ(out_data.size(), 1); + if (!init_cudnn_) { + init_cudnn_ = true; + size_t workspace_byte = static_cast(param_.workspace * sizeof(real_t)); + size_t back_size = 0; + size_t back_size_w = 0; + Tensor data = in_data[deconv::kData].get(s); + Tensor out = out_data[deconv::kOut].get(s); + CHECK_EQ(cudnnCreateTensorDescriptor(&in_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnCreateTensorDescriptor(&out_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnCreateTensorDescriptor(&bias_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnCreateFilterDescriptor(&filter_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnCreateConvolutionDescriptor(&conv_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnSetFilter4dDescriptor(filter_desc_, + dtype_, + data.shape_[1], + param_.num_filter, + param_.kernel[0], + param_.kernel[1]), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnSetConvolution2dDescriptor(conv_desc_, + param_.pad[0], + param_.pad[1], + param_.stride[0], + param_.stride[1], + 1, + 1, + CUDNN_CROSS_CORRELATION), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnSetTensor4dDescriptor(in_desc_, + CUDNN_TENSOR_NCHW, + dtype_, + data.shape_[0], + data.shape_[1], + data.shape_[2], + data.shape_[3]), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnSetTensor4dDescriptor(out_desc_, + CUDNN_TENSOR_NCHW, + dtype_, + out.shape_[0], + out.shape_[1], + out.shape_[2], + out.shape_[3]), CUDNN_STATUS_SUCCESS); + if (!param_.no_bias) { + Tensor bias = in_data[deconv::kBias].get(s); + CHECK_EQ(cudnnSetTensor4dDescriptor(bias_desc_, + CUDNN_TENSOR_NCHW, + dtype_, + 1, + bias.shape_[0], + 1, + 1), CUDNN_STATUS_SUCCESS); + } + CHECK_EQ(s->dnn_handle_ownership_, mshadow::Stream::OwnHandle); + CHECK_EQ(cudnnGetConvolutionForwardAlgorithm(s->dnn_handle_, + out_desc_, + filter_desc_, + conv_desc_, + in_desc_, + CUDNN_CONVOLUTION_FWD_PREFER_FASTEST, + workspace_byte, + &algo_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnGetConvolutionBackwardFilterAlgorithm(s->dnn_handle_, + out_desc_, + in_desc_, + conv_desc_, + filter_desc_, + CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST, + workspace_byte, + &back_algo_w_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnGetConvolutionBackwardDataAlgorithm(s->dnn_handle_, + filter_desc_, + in_desc_, + conv_desc_, + out_desc_, + CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST, + workspace_byte, + &back_algo_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnGetConvolutionBackwardDataWorkspaceSize(s->dnn_handle_, + filter_desc_, + in_desc_, + conv_desc_, + out_desc_, + back_algo_, + &back_size), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnGetConvolutionBackwardFilterWorkspaceSize(s->dnn_handle_, + out_desc_, + in_desc_, + conv_desc_, + filter_desc_, + back_algo_w_, + &back_size_w), CUDNN_STATUS_SUCCESS); + backward_workspace_byte_ = std::max(back_size, back_size_w); + CHECK_EQ(cudnnGetConvolutionForwardWorkspaceSize(s->dnn_handle_, + out_desc_, + filter_desc_, + conv_desc_, + in_desc_, + algo_, + &forward_workspace_byte_), CUDNN_STATUS_SUCCESS); + forward_workspace_ = forward_workspace_byte_ / sizeof(real_t) + 1; + backward_workspace_ = backward_workspace_byte_ / sizeof(real_t) + 1; + } + } + + bool init_cudnn_; + size_t forward_workspace_; + size_t backward_workspace_; + size_t forward_workspace_byte_; + size_t backward_workspace_byte_; + cudnnDataType_t dtype_; + cudnnTensorDescriptor_t in_desc_; + cudnnTensorDescriptor_t out_desc_; + cudnnTensorDescriptor_t bias_desc_; + cudnnFilterDescriptor_t filter_desc_; + cudnnConvolutionDescriptor_t conv_desc_; + cudnnConvolutionFwdAlgo_t algo_; + 
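A note on the three algorithm members declared here: this cuDNN path realizes deconvolution by reusing cuDNN's convolution kernels with the data and gradient roles swapped, so Forward runs cudnnConvolutionBackwardData, the weight gradient uses cudnnConvolutionBackwardFilter, and the data gradient uses cudnnConvolutionForward. A plain-Python sketch of the output geometry this implies, matching InferShape in deconvolution-inl.h below (the function name is illustrative):

```python
def deconv_out_dim(in_dim, kernel, stride, pad):
    # Deconvolution inverts the convolution shape rule:
    #   conv:   out = (in + 2*pad - kernel) // stride + 1
    #   deconv: out = stride * (in - 1) + kernel - 2*pad
    return stride * (in_dim - 1) + kernel - 2 * pad

# e.g. a 4x4 map upsampled with kernel=3, stride=2, pad=1 gives 7x7
assert deconv_out_dim(4, kernel=3, stride=2, pad=1) == 7
```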
cudnnConvolutionBwdDataAlgo_t back_algo_; + cudnnConvolutionBwdFilterAlgo_t back_algo_w_; + DeconvolutionParam param_; +}; +#endif // __CUDACC__ && CUDNN +} // namespace op +} // namespace mxnet + +#endif // MXNET_OPERATOR_CUDNN_DECONVOLUTION_INL_H_ diff --git a/src/operator/deconvolution-inl.h b/src/operator/deconvolution-inl.h new file mode 100644 index 000000000000..c94c56691421 --- /dev/null +++ b/src/operator/deconvolution-inl.h @@ -0,0 +1,361 @@ +/*! + * Copyright (c) 2015 by Contributors + * \file deconvolution-inl.h + * \brief + * \author Wei Wu +*/ +#ifndef MXNET_OPERATOR_DECONVOLUTION_INL_H_ +#define MXNET_OPERATOR_DECONVOLUTION_INL_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "./operator_common.h" + + +namespace mxnet { +namespace op { + +namespace deconv { + enum DeconvolutionOpInputs {kData, kWeight, kBias}; + enum DeconvolutionOpOutputs {kOut}; + enum DeconvolutionOpResource {kTempSpace}; +} + +struct DeconvolutionParam : public dmlc::Parameter { + TShape kernel; + TShape stride; + TShape pad; + uint32_t num_filter; + uint32_t num_group; + uint64_t workspace; + bool no_bias; + DMLC_DECLARE_PARAMETER(DeconvolutionParam) { + int shape[] = {1, 1}; + DMLC_DECLARE_FIELD(kernel).describe("deconvolution kernel size: (y, x)"); + DMLC_DECLARE_FIELD(stride).set_default(TShape(shape, shape + 2)) + .describe("deconvolution stride: (y, x)"); + shape[0] = shape[1] = 0; + DMLC_DECLARE_FIELD(pad).set_default(TShape(shape, shape + 2)) + .describe("pad for deconvolution: (y, x)"); + DMLC_DECLARE_FIELD(num_filter).set_range(1, 100000) + .describe("deconvolution filter(channel) number"); + DMLC_DECLARE_FIELD(num_group).set_default(1) + .describe("number of groups partition"); + DMLC_DECLARE_FIELD(workspace).set_default(512).set_range(128, 4096) + .describe("Tmp workspace for deconvolution (MB)"); + DMLC_DECLARE_FIELD(no_bias).set_default(true) + .describe("Whether to disable bias parameter."); + } +}; + +template +class DeconvolutionOp : public Operator { + public: + explicit DeconvolutionOp(DeconvolutionParam p) { + this->param_ = p; + // convert MB to words + param_.workspace = (param_.workspace << 20) / sizeof(real_t); + } + + virtual void Forward(const OpContext &ctx, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data, + const std::vector &aux_args) { + using namespace mshadow; + using namespace mshadow::expr; + CHECK_EQ(req[deconv::kOut], kWriteTo); + size_t expected = param_.no_bias ? 
2 : 3; + CHECK_EQ(in_data.size(), expected); + CHECK_EQ(out_data.size(), 1); + Stream *s = ctx.get_stream(); + Tensor data = in_data[deconv::kData].get(s); + Tensor out = out_data[deconv::kOut].get(s); + Shape<3> wmat_shape = + Shape3(param_.num_group, + data.shape_[1] / param_.num_group, + param_.num_filter / param_.num_group * param_.kernel[0] * param_.kernel[1]); + Tensor wmat = in_data[deconv::kWeight].get_with_shape(wmat_shape, s); +#if defined(__CUDACC__) + CHECK_EQ(s->blas_handle_ownership_, Stream::OwnHandle) + << "Must init CuBLAS handle in stream"; +#endif + const index_t nbatch = data.size(0); + Tensor workspace = ctx.requested[deconv::kTempSpace].get_space( + Shape1(this->InitTemp(out.shape_, data.shape_)), s); + for (index_t i = 0; i < nbatch; i += nstep_) { + const index_t step = std::min(nstep_, nbatch - i); + Tensor temp_col = Tensor(workspace.dptr_, + Shape2(shape_colunit_[0], + shape_colunit_[1] * step), s); + Tensor temp_dst = Tensor(workspace.dptr_ + temp_col.shape_.Size(), + Shape3(shape_dstunit_[0], + shape_dstunit_[1], + shape_dstunit_[2] * step), s); + temp_dst = reshape(swapaxis<1, 0>(data.Slice(i, i + step)), temp_dst.shape_); + if (param_.pad[0] == 0 && param_.pad[1] == 0) { + temp_col = unpack_patch2col(out.Slice(i, i + step), + param_.kernel[0], + param_.kernel[1], + param_.stride[0], + param_.stride[1]); + } else { + temp_col = unpack_patch2col(pad(out.Slice(i, i + step), + param_.pad[0], param_.pad[1]), + param_.kernel[0], + param_.kernel[1], + param_.stride[0], + param_.stride[1]); + } + const index_t gstride = temp_col.size(0) / param_.num_group; + for (uint32_t gid = 0; gid < param_.num_group; ++gid) { + mshadow::Tensor tmpc = temp_col.Slice(gstride * gid, + gstride * (gid + 1)); + tmpc = dot(wmat[gid].T(), temp_dst[gid]); + } + if (param_.pad[0] == 0 && param_.pad[1] == 0) { + out.Slice(i, i + step) = pack_col2patch(temp_col, + out.Slice(i, i + step).shape_, + param_.kernel[0], + param_.kernel[1], + param_.stride[0]); + } else { + Shape<4> pshape = out.Slice(i, i + step).shape_; + pshape[2] += 2 * param_.pad[0]; + pshape[3] += 2 * param_.pad[1]; + out.Slice(i, i + step) = crop(pack_col2patch(temp_col, + pshape, + param_.kernel[0], + param_.kernel[1], + param_.stride[0]), + out[i][0].shape_); + } + } + if (!param_.no_bias) { + // add bias, broadcast bias to dim 1: channel + Tensor bias = in_data[deconv::kBias].get(s); + out += broadcast<1>(bias, out.shape_); + } + } + + virtual void Backward(const OpContext &ctx, + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &req, + const std::vector &in_grad, + const std::vector &aux_args) { + using namespace mshadow; + using namespace mshadow::expr; + // TODO(bing): check the BLAS Handle, be careful + CHECK_EQ(out_grad.size(), 1); + size_t expected = param_.no_bias == 0 ? 
3 : 2; + CHECK(in_data.size() == expected && in_grad.size() == expected); + CHECK_EQ(req.size(), expected); + CHECK_EQ(in_data[deconv::kWeight].CheckContiguous(), true); + // get data + Stream *s = ctx.get_stream(); + Tensor data = in_data[deconv::kData].get(s); + Tensor grad = out_grad[deconv::kOut].get(s); + Tensor gdata = in_grad[deconv::kData].get(s); + Shape<3> wmat_shape = + Shape3(param_.num_group, + data.shape_[1] / param_.num_group, + param_.num_filter / param_.num_group * param_.kernel[0] * param_.kernel[1]); + Tensor wmat = in_data[deconv::kWeight].get_with_shape(wmat_shape, s); + Tensor gwmat = in_grad[deconv::kWeight].get_with_shape(wmat_shape, s); +#if defined(__CUDACC__) + CHECK_EQ(s->blas_handle_ownership_, Stream::OwnHandle) + << "Must init CuBLAS handle in stream"; +#endif + const index_t nbatch = data.size(0); + Tensor workspace = ctx.requested[deconv::kTempSpace].get_space( + Shape1(this->InitTemp(grad.shape_, data.shape_)), s); + for (index_t i = 0; i < nbatch; i += nstep_) { + const index_t step = std::min(nstep_, nbatch - i); + Tensor temp_col = Tensor(workspace.dptr_, + Shape2(shape_colunit_[0], + shape_colunit_[1] * step), s); + Tensor temp_dst = Tensor(workspace.dptr_ + temp_col.shape_.Size(), + Shape3(shape_dstunit_[0], + shape_dstunit_[1], + shape_dstunit_[2] * step), s); + temp_dst = reshape(swapaxis<1, 0>(data.Slice(i, i + step)), temp_dst.shape_); + if (param_.pad[0] == 0 && param_.pad[1] == 0) { + temp_col = unpack_patch2col(grad.Slice(i, i + step), + param_.kernel[0], + param_.kernel[1], + param_.stride[0], + param_.stride[1]); + } else { + temp_col = unpack_patch2col(pad(grad.Slice(i, i + step), param_.pad[0], param_.pad[1]), + param_.kernel[0], + param_.kernel[1], + param_.stride[0], + param_.stride[1]); + } + const index_t gstride = temp_col.size(0) / param_.num_group; + for (uint32_t gid = 0; gid < param_.num_group; ++gid) { + Tensor tmpc = temp_col.Slice(gstride * gid, gstride * (gid + 1)); + if (i == 0) { + Tensor tmp_gwmat = gwmat[gid]; + Assign(tmp_gwmat, req[deconv::kWeight], dot(temp_dst[gid], tmpc.T())); + } else { + gwmat[gid] += dot(temp_dst[gid], tmpc.T()); + } + } + if (req[deconv::kData] == kWriteTo || req[deconv::kData] == kWriteInplace) { + for (uint32_t gid = 0; gid < param_.num_group; ++gid) { + Tensor tmpc = temp_col.Slice(gstride * gid, gstride * (gid + 1)); + temp_dst[gid] = dot(wmat[gid], tmpc); + } + gdata.Slice(i, i + step) = swapaxis<1, 0>(reshape(temp_dst, + mshadow::Shape4(gdata.shape_[1], + step, + gdata.size(2), + gdata.size(3)))); + } + } + if (!param_.no_bias) { + Tensor gbias = in_grad[deconv::kBias].get(s); + Assign(gbias, req[deconv::kBias], sumall_except_dim<1>(grad)); + } + } + + private: + inline index_t InitTemp(const mshadow::Shape<4> &ishape, + const mshadow::Shape<4> &oshape) { + const int ksize_y = param_.kernel[0]; + const int ksize_x = param_.kernel[1]; + shape_colunit_ = mshadow::Shape2(ishape[1] * ksize_y * ksize_x, + oshape[2] * oshape[3]); + shape_dstunit_ = mshadow::Shape3(param_.num_group, + oshape[1] / param_.num_group, + oshape[2] * oshape[3]); + const uint64_t workspace_size = param_.workspace; + nstep_ = std::max(std::min(static_cast(workspace_size / shape_colunit_.Size()), + ishape[0]), 1U); + int nop = (ishape[0] + nstep_ - 1) / nstep_; + nstep_ = (ishape[0] + nop - 1) / nop; + mshadow::Shape<2> scol = mshadow::Shape2(shape_colunit_[0], + shape_colunit_[1] * nstep_); + mshadow::Shape<3> sdst = mshadow::Shape3(shape_dstunit_[0], + shape_dstunit_[1], + shape_dstunit_[2] * nstep_); + 
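InitTemp above budgets the temporary workspace by batching: it picks the largest per-pass batch step whose unpacked column buffer fits in the configured workspace, then rebalances so every pass over the batch is roughly the same size. A plain-Python sketch of that arithmetic, mirroring the nstep_ computation just above (names are illustrative):

```python
def plan_batch_step(workspace_words, colunit_words, batch):
    # largest step whose column buffer fits the workspace, at least 1
    nstep = max(min(workspace_words // colunit_words, batch), 1)
    # number of passes over the batch at that step size
    npass = (batch + nstep - 1) // nstep
    # even out the passes so the last one is not much smaller
    return (batch + npass - 1) // npass
```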
CHECK_GE(param_.workspace, scol.Size() + sdst.Size()) + << "\nMinimum workspace size: " << scol.Size() + sdst.Size() << "\n" + << "Given: " << param_.workspace; + return scol.Size() + sdst.Size(); + } + + DeconvolutionParam param_; + mshadow::Shape<2> shape_colunit_; + mshadow::Shape<3> shape_dstunit_; + index_t nstep_; +}; // class DeconvolutionOp + +template +Operator* CreateOp(DeconvolutionParam param); + +#if DMLC_USE_CXX11 +class DeconvolutionProp : public OperatorProperty { + public: + std::vector ListArguments() const override { + if (!param_.no_bias) { + return {"data", "weight", "bias"}; + } else { + return {"data", "weight"}; + } + } + + void Init(const std::vector >& kwargs) override { + param_.Init(kwargs); + } + + std::map GetParams() const override { + return param_.__DICT__(); + } + + bool InferShape(std::vector *in_shape, + std::vector *out_shape, + std::vector *aux_shape) const override { + using namespace mshadow; + if (!param_.no_bias) { + CHECK_EQ(in_shape->size(), 3) << "Input:[data, weight, bias]"; + } else { + CHECK_EQ(in_shape->size(), 2) << "Input:[data, weight]"; + } + const TShape &dshape = (*in_shape)[deconv::kData]; + if (dshape.ndim() == 0) return false; + CHECK_EQ(dshape.ndim(), 4) \ + << "Input data should be 4D in batch-num_filter-y-x"; + SHAPE_ASSIGN_CHECK(*in_shape, + deconv::kWeight, + Shape4(dshape[1], param_.num_filter, param_.kernel[0], param_.kernel[1])); + if (!param_.no_bias) { + SHAPE_ASSIGN_CHECK(*in_shape, deconv::kBias, Shape1(param_.num_filter)); + } + out_shape->clear(); + out_shape->push_back(dshape); + const index_t ksize_y = static_cast(param_.kernel[0]); + const index_t ksize_x = static_cast(param_.kernel[1]); + CHECK_EQ(dshape[1] % param_.num_group, 0) \ + << "input num_filter must divide group size"; + CHECK_EQ(param_.num_filter % param_.num_group, 0) \ + << "output num_filter must divide group size"; + CHECK_GE(param_.kernel.Size(), 0) \ + << "incorrect kernel size: " << param_.kernel; + CHECK_GE(param_.stride.Size(), 0) \ + << "incorrect stride size: " << param_.stride; + CHECK(ksize_x <= dshape[3] && ksize_y <= dshape[2]) + << "kernel size exceed input"; + (*out_shape)[deconv::kOut][1] = param_.num_filter; + (*out_shape)[deconv::kOut][2] = param_.stride[0] * (dshape[2] - 1) + + ksize_y - 2 * param_.pad[0]; + (*out_shape)[deconv::kOut][3] = param_.stride[1] * (dshape[3] - 1) + + ksize_x - 2 * param_.pad[1]; + return true; + } + + OperatorProperty* Copy() const override { + auto ptr = new DeconvolutionProp(); + ptr->param_ = param_; + return ptr; + } + + std::string TypeString() const override { + return "Deconvolution"; + } + + std::vector DeclareBackwardDependency( + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data) const override { + return {out_grad[deconv::kOut], in_data[deconv::kData], in_data[deconv::kWeight]}; + } + + std::vector ForwardResource( + const std::vector &in_shape) const override { + return {ResourceRequest::kTempSpace}; + } + + std::vector BackwardResource( + const std::vector &in_shape) const override { + return {ResourceRequest::kTempSpace}; + } + + Operator* CreateOperator(Context ctx) const override; + + private: + DeconvolutionParam param_; +}; // class DeconvolutionProp +#endif // DMLC_USE_CXX11 +} // namespace op +} // namespace mxnet +#endif // MXNET_OPERATOR_DECONVOLUTION_INL_H_ diff --git a/src/operator/deconvolution.cc b/src/operator/deconvolution.cc new file mode 100644 index 000000000000..fe5deeafc05b --- /dev/null +++ b/src/operator/deconvolution.cc @@ -0,0 
+1,31 @@ +/*! + * Copyright (c) 2015 by Contributors + * \file deconvolution.cc + * \brief + * \author Wei Wu +*/ + +#include "./deconvolution-inl.h" + +namespace mxnet { +namespace op { +template<> +Operator* CreateOp(DeconvolutionParam param) { + return new DeconvolutionOp(param); +} + +Operator* DeconvolutionProp::CreateOperator(Context ctx) const { + DO_BIND_DISPATCH(CreateOp, param_); +} + +DMLC_REGISTER_PARAMETER(DeconvolutionParam); + +MXNET_REGISTER_OP_PROPERTY(Deconvolution, DeconvolutionProp) +.add_argument("data", "Symbol", "Input data to the DeconvolutionOp.") +.add_argument("weight", "Symbol", "Weight matrix.") +.add_argument("bias", "Symbol", "Bias parameter.") +.add_arguments(DeconvolutionParam::__FIELDS__()) +.describe("Apply deconvolution to input then add a bias."); + +} // namespace op +} // namespace mxnet diff --git a/src/operator/deconvolution.cu b/src/operator/deconvolution.cu new file mode 100644 index 000000000000..d7662735e89c --- /dev/null +++ b/src/operator/deconvolution.cu @@ -0,0 +1,25 @@ +/*! + * Copyright (c) 2015 by Contributors + * \file deconvolution.cu + * \brief + * \author Wei Wu +*/ + +#include "./deconvolution-inl.h" +#if MXNET_USE_CUDNN == 1 +#include "./cudnn_deconvolution-inl.h" +#endif // MXNET_USE_CUDNN + +namespace mxnet { +namespace op { +template<> +Operator* CreateOp(DeconvolutionParam param) { +#if MXNET_USE_CUDNN == 1 + return new CuDNNDeconvolutionOp(param); +#else + return new DeconvolutionOp(param); +#endif // MXNET_USE_CUDNN +} + +} // namespace op +} // namespace mxnet diff --git a/src/operator/dropout-inl.h b/src/operator/dropout-inl.h index fa76bd38ccf6..1d117bf24c3d 100644 --- a/src/operator/dropout-inl.h +++ b/src/operator/dropout-inl.h @@ -164,7 +164,7 @@ class DropoutProp : public OperatorProperty { return {"output", "mask"}; } - Operator* CreateOperator(Context ctx) const; + Operator* CreateOperator(Context ctx) const override; private: DropoutParam param_; diff --git a/src/operator/elementwise_sum-inl.h b/src/operator/elementwise_sum-inl.h index d9c4c0e36206..f763032690d7 100644 --- a/src/operator/elementwise_sum-inl.h +++ b/src/operator/elementwise_sum-inl.h @@ -194,7 +194,7 @@ class ElementWiseSumProp : public OperatorProperty { return {{in_data[0], out_data[0]}}; } - Operator* CreateOperator(Context ctx) const; + Operator* CreateOperator(Context ctx) const override; private: ElementWiseSumParam param_; diff --git a/src/operator/fully_connected-inl.h b/src/operator/fully_connected-inl.h index 262aba95d0fb..3454c3498cf9 100644 --- a/src/operator/fully_connected-inl.h +++ b/src/operator/fully_connected-inl.h @@ -195,7 +195,7 @@ class FullyConnectedProp : public OperatorProperty { return {{in_data[fullc::kData], in_grad[fullc::kData]}}; } - Operator* CreateOperator(Context ctx) const; + Operator* CreateOperator(Context ctx) const override; private: FullyConnectedParam param_; diff --git a/src/operator/leaky_relu-inl.h b/src/operator/leaky_relu-inl.h index 4bdb65ef415a..3d4429556877 100644 --- a/src/operator/leaky_relu-inl.h +++ b/src/operator/leaky_relu-inl.h @@ -128,7 +128,6 @@ class LeakyReLUOp : public Operator { const std::vector &aux_args) { using namespace mshadow; using namespace mshadow::expr; - // TODO(bing): double check size_t expected = param_.act_type == leakyrelu::kPReLU ? 
2 : 1; CHECK_EQ(out_grad.size(), 1); CHECK_EQ(req.size(), expected); @@ -141,9 +140,9 @@ class LeakyReLUOp : public Operator { Tensor mask; Tensor weight; Tensor grad_weight; - if (in_data[leakyrelu::kData].ndim() == 2) { - Shape<4> dshape = Shape4(in_data[leakyrelu::kData].shape_[0], - in_data[leakyrelu::kData].shape_[1], 1, 1); + if (out_grad[leakyrelu::kOut].ndim() == 2) { + Shape<4> dshape = Shape4(out_grad[leakyrelu::kOut].shape_[0], + out_grad[leakyrelu::kOut].shape_[1], 1, 1); grad = out_grad[leakyrelu::kOut].get_with_shape(dshape, s); gdata = in_grad[leakyrelu::kData].get_with_shape(dshape, s); output = out_data[leakyrelu::kOut].get_with_shape(dshape, s); @@ -298,8 +297,8 @@ class LeakyReLUProp : public OperatorProperty { return 1; } - virtual std::vector ForwardResource( - const std::vector &in_shape) const { + std::vector ForwardResource( + const std::vector &in_shape) const override { if (param_.act_type == leakyrelu::kRReLU) { return {ResourceRequest::kRandom}; } else { @@ -307,7 +306,7 @@ class LeakyReLUProp : public OperatorProperty { } } - Operator* CreateOperator(Context ctx) const; + Operator* CreateOperator(Context ctx) const override; private: LeakyReLUParam param_; diff --git a/src/operator/lrn-inl.h b/src/operator/lrn-inl.h index 35aac8fe73ae..40985e7b5112 100644 --- a/src/operator/lrn-inl.h +++ b/src/operator/lrn-inl.h @@ -179,7 +179,7 @@ class LocalResponseNormProp : public OperatorProperty { #endif } - Operator* CreateOperator(Context ctx) const; + Operator* CreateOperator(Context ctx) const override; private: LRNParam param_; diff --git a/src/operator/native_op-inl.h b/src/operator/native_op-inl.h new file mode 100644 index 000000000000..afe6868aa415 --- /dev/null +++ b/src/operator/native_op-inl.h @@ -0,0 +1,256 @@ +/*! + * Copyright (c) 2015 by Contributors + * \file native_op-inl.h + * \brief + * \author Junyuan Xie +*/ + +#ifndef MXNET_OPERATOR_NATIVE_OP_INL_H_ +#define MXNET_OPERATOR_NATIVE_OP_INL_H_ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "./operator_common.h" + +namespace mxnet { +namespace op { + +struct NativeOpParam : public dmlc::Parameter { + void *info; + bool need_top_grad; + + NativeOpInfo *pinfo; + int num_inputs_, num_outputs_; + DMLC_DECLARE_PARAMETER(NativeOpParam) { + DMLC_DECLARE_FIELD(info); + DMLC_DECLARE_FIELD(need_top_grad).set_default(true) + .describe("Whether this layer needs out grad for backward. 
" + "Should be false for loss layers."); + } +}; + +template +class NativeOp : public Operator { + public: + explicit NativeOp(NativeOpParam p) { + this->param_ = p; + } + + virtual void Forward(const OpContext &ctx, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data, + const std::vector &aux_args) { + using namespace mshadow; + Stream *s = ctx.get_stream(); + ptrs.clear(); + ndims.clear(); + shapes.clear(); + tags.clear(); + SyncVec(in_data, "in_data", s, 0); + SyncVec(out_data, "out_data", s, 1); + s->Wait(); + param_.pinfo->forward(ptrs.size(), ptrs.data(), ndims.data(), shapes.data(), tags.data()); + for (index_t i = 0; i < out_data.size(); ++i) { + CHECK_NE(req[i], kAddTo) << "NativeOp doesn't support AddTo for output"; + if (req[i] != kNullOp) { + std::stringstream ss; + ss << std::string("out_data") << i; + Copy(out_data[i].FlatTo2D(s), + buffer_map[ss.str()].second, s); + } + } + s->Wait(); + } + + virtual void Backward(const OpContext &ctx, + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &req, + const std::vector &in_grad, + const std::vector &aux_args) { + using namespace mshadow; + Stream *s = ctx.get_stream(); + ptrs.clear(); + ndims.clear(); + shapes.clear(); + tags.clear(); + SyncVec(in_data, "in_data", s, 0); + SyncVec(out_data, "out_data", s, 1); + SyncVec(in_grad, "in_grad", s, 2); + if (param_.need_top_grad) { + SyncVec(out_grad, "out_grad", s, 3); + } + s->Wait(); + param_.pinfo->backward(ptrs.size(), ptrs.data(), ndims.data(), shapes.data(), tags.data()); + for (index_t i = 0; i < in_grad.size(); ++i) { + CHECK_NE(req[i], kAddTo) << "NativeOp doesn't support AddTo for output"; + if (req[i] != kNullOp) { + std::stringstream ss; + ss << std::string("in_grad") << i; + Copy(in_grad[i].FlatTo2D(s), + buffer_map[ss.str()].second, s); + } + } + s->Wait(); + } + + private: + NativeOpParam param_; + std::vector ptrs; + std::vector ndims; + std::vector shapes; + std::vector tags; + std::map > > buffer_map; + + virtual void SyncBuffer(const TBlob &tblob, + const std::string &name, + mshadow::Stream *stream) { + using namespace mshadow; + std::map > >::iterator buffer = + buffer_map.find(name); + if (buffer == buffer_map.end() || buffer->second.first != tblob.shape_) { + if (buffer != buffer_map.end()) { + FreeSpace<2, real_t>(&(buffer->second.second)); + buffer_map.erase(buffer); + } + buffer_map[name] = + std::pair >(tblob.shape_, + NewTensor(tblob.shape_.FlatTo2D(), + 0.0f, + false)); + buffer = buffer_map.find(name); + } + Copy(buffer->second.second, tblob.FlatTo2D(stream), stream); + } + + virtual void SyncVec(const std::vector &vec, + const std::string &prefix, + mshadow::Stream *stream, + int tag) { + for (size_t i = 0; i < vec.size(); ++i) { + std::stringstream name; + name << prefix << i; + SyncBuffer(vec[i], name.str(), stream); + ptrs.push_back(buffer_map[name.str()].second.dptr_); + ndims.push_back(vec[i].ndim()); + shapes.push_back(const_cast(vec[i].shape_.data())); + tags.push_back(tag); + } + } +}; // NativeOp + +template +Operator* CreateOp(NativeOpParam param); + +#if DMLC_USE_CXX11 +class NativeOpProp : public OperatorProperty { + public: + std::vector ListArguments() const override { + char ** args = NULL; + param_.pinfo->list_arguments(&args); + std::vector ret; + for (int i = 0; args[i] != NULL; ++i) { + ret.push_back(args[i]); + } + return ret; + } + + std::vector ListOutputs() const override { + char ** args = NULL; + param_.pinfo->list_outputs(&args); + 
std::vector ret; + for (int i = 0; args[i] != NULL; ++i) { + ret.push_back(args[i]); + } + return ret; + } + + int NumOutputs() const override { + return param_.num_outputs_; + } + + void Init(const std::vector >& kwargs) override { + param_.Init(kwargs); + for (auto iter = kwargs.begin(); iter != kwargs.end(); ++iter) { + if (iter->first == "info") { + sscanf(iter->second.c_str(), "%p", &param_.pinfo); + } + } + param_.num_inputs_ = ListArguments().size(); + param_.num_outputs_ = ListOutputs().size(); + } + + std::map GetParams() const override { + return param_.__DICT__(); + } + + + bool InferShape(std::vector *in_shape, + std::vector *out_shape, + std::vector *aux_shape) const override { + std::vector shapes; + std::vector ndims; + for (auto iter = in_shape->begin(); iter != in_shape->end(); ++iter) { + shapes.push_back(iter->data()); + ndims.push_back(iter->ndim()); + } + shapes.resize(param_.num_inputs_+param_.num_outputs_); + ndims.resize(param_.num_inputs_+param_.num_outputs_); + param_.pinfo->infer_shape(shapes.size(), ndims.data(), shapes.data()); + for (unsigned i = 0; i < in_shape->size(); ++i) { + (*in_shape)[i] = TShape(shapes[i], shapes[i]+ndims[i]); + } + for (unsigned i = param_.num_inputs_; i < param_.num_inputs_ + out_shape->size(); ++i) { + (*out_shape)[i-param_.num_inputs_] = TShape(shapes[i], shapes[i]+ndims[i]); + } + return true; + } + + OperatorProperty* Copy() const override { + NativeOpProp *prop_sym = new NativeOpProp(); + prop_sym->param_ = this->param_; + return prop_sym; + } + + std::string TypeString() const override { + return "_Native"; + } + + std::vector DeclareBackwardDependency( + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data) const override { + std::vector deps; + if (param_.need_top_grad) { + deps.insert(deps.end(), out_grad.begin(), out_grad.end()); + } + deps.insert(deps.end(), in_data.begin(), in_data.end()); + deps.insert(deps.end(), out_data.begin(), out_data.end()); + return deps; + } + + std::vector > BackwardInplaceOption( + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &in_grad) const override { + return {}; + } + + Operator* CreateOperator(Context ctx) const override; + + private: + NativeOpParam param_; +}; // class NativeOpProp +#endif // DMLC_USE_CXX11 +} // namespace op +} // namespace mxnet +#endif // MXNET_OPERATOR_NATIVE_OP_INL_H_ diff --git a/src/operator/native_op.cc b/src/operator/native_op.cc new file mode 100644 index 000000000000..7ab0614a041c --- /dev/null +++ b/src/operator/native_op.cc @@ -0,0 +1,27 @@ +/*! + * Copyright (c) 2015 by Contributors + * \file native_op.cc + * \brief + * \author Junyuan Xie +*/ +#include "./native_op-inl.h" + +namespace mxnet { +namespace op { +template<> +Operator *CreateOp(NativeOpParam param) { + return new NativeOp(param); +} + +Operator* NativeOpProp::CreateOperator(Context ctx) const { + DO_BIND_DISPATCH(CreateOp, param_); +} + +DMLC_REGISTER_PARAMETER(NativeOpParam); + +MXNET_REGISTER_OP_PROPERTY(_Native, NativeOpProp) +.describe("Stub for implementing an operator in a native frontend language.") +.add_arguments(NativeOpParam::__FIELDS__()); + +} // namespace op +} // namespace mxnet diff --git a/src/operator/native_op.cu b/src/operator/native_op.cu new file mode 100644 index 000000000000..807592626e8b --- /dev/null +++ b/src/operator/native_op.cu @@ -0,0 +1,15 @@ +/*!
+ * Copyright (c) 2015 by Contributors + * \file native_op.cu + * \brief + * \author Junyuan Xie +*/ +#include "./native_op-inl.h" +namespace mxnet { +namespace op { +template<> +Operator* CreateOp(NativeOpParam param) { + return new NativeOp(param); +} +} // namespace op +} // namespace mxnet diff --git a/src/operator/pooling-inl.h b/src/operator/pooling-inl.h index 1f3d76e1ab7a..54808c9bed19 100644 --- a/src/operator/pooling-inl.h +++ b/src/operator/pooling-inl.h @@ -215,7 +215,7 @@ class PoolingProp : public OperatorProperty { #endif } - Operator* CreateOperator(Context ctx) const; + Operator* CreateOperator(Context ctx) const override; private: PoolingParam param_; diff --git a/src/operator/regression_output-inl.h b/src/operator/regression_output-inl.h index 479579d4b472..0366d16e85a8 100644 --- a/src/operator/regression_output-inl.h +++ b/src/operator/regression_output-inl.h @@ -130,7 +130,7 @@ class RegressionOutputProp : public OperatorProperty { return {{in_data[reg_enum::kData], out_data[reg_enum::kOut]}}; } - Operator* CreateOperator(Context ctx) const; + Operator* CreateOperator(Context ctx) const override; }; #endif // DMLC_USE_CXX11 } // namespace op diff --git a/src/operator/reshape-inl.h b/src/operator/reshape-inl.h index 12c2071a8c97..915b6938883c 100644 --- a/src/operator/reshape-inl.h +++ b/src/operator/reshape-inl.h @@ -144,7 +144,7 @@ class ReshapeProp : public OperatorProperty { return {{out_grad[reshape_enum::kOut], in_grad[reshape_enum::kData]}}; } - Operator* CreateOperator(Context ctx) const; + Operator* CreateOperator(Context ctx) const override; protected: ReshapeParam param_; diff --git a/src/operator/slice_channel-inl.h b/src/operator/slice_channel-inl.h index 05e3da199bda..6c426ec1bb65 100644 --- a/src/operator/slice_channel-inl.h +++ b/src/operator/slice_channel-inl.h @@ -29,7 +29,7 @@ enum SliceChannelOpOutputs {kOut0, kOut1, kOut2, kOut3, kOut4}; struct SliceChannelParam : public dmlc::Parameter { int num_outputs; DMLC_DECLARE_PARAMETER(SliceChannelParam) { - DMLC_DECLARE_FIELD(num_outputs).set_range(1, 6) + DMLC_DECLARE_FIELD(num_outputs).set_lower_bound(1) .describe("Number of outputs to be sliced."); } }; // struct SliceChannelParam @@ -170,7 +170,7 @@ class SliceChannelProp : public OperatorProperty { return out_grad; } - Operator* CreateOperator(Context ctx) const; + Operator* CreateOperator(Context ctx) const override; private: SliceChannelParam param_; diff --git a/src/operator/softmax.cc b/src/operator/softmax.cc deleted file mode 100644 index 2c2516ba9bc9..000000000000 --- a/src/operator/softmax.cc +++ /dev/null @@ -1,29 +0,0 @@ -/*! 
- * Copyright (c) 2015 by Contributors - * \file softmax.cc - * \brief - * \author Bing Xu -*/ -#include "./softmax-inl.h" - -namespace mxnet { -namespace op { -template<> -Operator *CreateOp(SoftmaxParam param) { - return new SoftmaxOp(param); -} - -Operator *SoftmaxProp::CreateOperator(Context ctx) const { - DO_BIND_DISPATCH(CreateOp, param_); -} - -DMLC_REGISTER_PARAMETER(SoftmaxParam); - -MXNET_REGISTER_OP_PROPERTY(Softmax, SoftmaxProp) -.describe("Perform a softmax transformation on input.") -.add_argument("data", "Symbol", "Input data to softmax.") -.add_arguments(SoftmaxParam::__FIELDS__()); - -} // namespace op -} // namespace mxnet - diff --git a/src/operator/softmax-inl.h b/src/operator/softmax_output-inl.h similarity index 57% rename from src/operator/softmax-inl.h rename to src/operator/softmax_output-inl.h index d1e5331d9d06..9528ed0a41c6 100644 --- a/src/operator/softmax-inl.h +++ b/src/operator/softmax_output-inl.h @@ -1,11 +1,11 @@ /*! * Copyright (c) 2015 by Contributors - * \file softmax-inl.h + * \file softmax_output-inl.h * \brief * \author Bing Xu */ -#ifndef MXNET_OPERATOR_SOFTMAX_INL_H_ -#define MXNET_OPERATOR_SOFTMAX_INL_H_ +#ifndef MXNET_OPERATOR_SOFTMAX_OUTPUT_INL_H_ +#define MXNET_OPERATOR_SOFTMAX_OUTPUT_INL_H_ #include #include @@ -20,15 +20,15 @@ namespace mxnet { namespace op { -namespace softmax_enum { -enum SoftmaxOpInputs {kData, kLabel}; -enum SoftmaxOpOutputs {kOut}; -} // namespace softmax_enum +namespace softmaxout_enum { +enum SoftmaxOutputOpInputs {kData, kLabel}; +enum SoftmaxOutputOpOutputs {kOut}; +} // namespace softmaxout_enum -struct SoftmaxParam : public dmlc::Parameter { +struct SoftmaxOutputParam : public dmlc::Parameter { float grad_scale; bool multi_output; - DMLC_DECLARE_PARAMETER(SoftmaxParam) { + DMLC_DECLARE_PARAMETER(SoftmaxOutputParam) { DMLC_DECLARE_FIELD(grad_scale).set_default(1.0f) .describe("Scale the gradient by a float factor"); DMLC_DECLARE_FIELD(multi_output).set_default(false) @@ -39,9 +39,9 @@ struct SoftmaxParam : public dmlc::Parameter { }; template -class SoftmaxOp : public Operator { +class SoftmaxOutputOp : public Operator { public: - explicit SoftmaxOp(SoftmaxParam param) : param_(param) {} + explicit SoftmaxOutputOp(SoftmaxOutputParam param) : param_(param) {} virtual void Forward(const OpContext &ctx, const std::vector &in_data, @@ -50,19 +50,19 @@ class SoftmaxOp : public Operator { const std::vector &aux_args) { using namespace mshadow; using namespace mshadow::expr; - CHECK_EQ(in_data.size(), 2) << "Softmax Input: [data, label]"; - CHECK_EQ(out_data.size(), 1) << "Softmax Output: [output]"; + CHECK_EQ(in_data.size(), 2) << "SoftmaxOutput Input: [data, label]"; + CHECK_EQ(out_data.size(), 1) << "SoftmaxOutput Output: [output]"; Stream *s = ctx.get_stream(); if (param_.multi_output) { - int n = in_data[softmax_enum::kData].size(0); - int k = in_data[softmax_enum::kData].size(1); - Shape<3> s3 = Shape3(n, k, static_cast(in_data[softmax_enum::kData].Size()/n/k)); - Tensor data = in_data[softmax_enum::kData].get_with_shape(s3, s); - Tensor out = out_data[softmax_enum::kOut].get_with_shape(s3, s); + int n = in_data[softmaxout_enum::kData].size(0); + int k = in_data[softmaxout_enum::kData].size(1); + Shape<3> s3 = Shape3(n, k, static_cast(in_data[softmaxout_enum::kData].Size()/n/k)); + Tensor data = in_data[softmaxout_enum::kData].get_with_shape(s3, s); + Tensor out = out_data[softmaxout_enum::kOut].get_with_shape(s3, s); Softmax(out, data); } else { - Tensor data = in_data[softmax_enum::kData].FlatTo2D(s); - Tensor out 
= out_data[softmax_enum::kOut].FlatTo2D(s); + Tensor data = in_data[softmaxout_enum::kData].FlatTo2D(s); + Tensor out = out_data[softmaxout_enum::kOut].FlatTo2D(s); Softmax(out, data); } } @@ -82,20 +82,20 @@ class SoftmaxOp : public Operator { CHECK_GE(req.size(), 1); Stream *s = ctx.get_stream(); if (param_.multi_output) { - int n = out_data[softmax_enum::kOut].size(0); - int k = out_data[softmax_enum::kOut].size(1); - Shape<3> s3 = Shape3(n, k, static_cast(out_data[softmax_enum::kOut].Size()/n/k)); - Tensor label = in_data[softmax_enum::kLabel].FlatTo2D(s); - Tensor out = out_data[softmax_enum::kOut].get_with_shape(s3, s); - Tensor grad = in_grad[softmax_enum::kData].get_with_shape(s3, s); + int n = out_data[softmaxout_enum::kOut].size(0); + int k = out_data[softmaxout_enum::kOut].size(1); + Shape<3> s3 = Shape3(n, k, static_cast(out_data[softmaxout_enum::kOut].Size()/n/k)); + Tensor label = in_data[softmaxout_enum::kLabel].FlatTo2D(s); + Tensor out = out_data[softmaxout_enum::kOut].get_with_shape(s3, s); + Tensor grad = in_grad[softmaxout_enum::kData].get_with_shape(s3, s); SoftmaxGrad(grad, out, label); if (param_.grad_scale < 1.0) { grad *= param_.grad_scale; } } else { - Tensor label = in_data[softmax_enum::kLabel].get(s); - Tensor out = out_data[softmax_enum::kOut].FlatTo2D(s); - Tensor grad = in_grad[softmax_enum::kData].FlatTo2D(s); + Tensor label = in_data[softmaxout_enum::kLabel].get(s); + Tensor out = out_data[softmaxout_enum::kOut].FlatTo2D(s); + Tensor grad = in_grad[softmaxout_enum::kData].FlatTo2D(s); SoftmaxGrad(grad, out, label); if (param_.grad_scale < 1.0) { grad *= param_.grad_scale; @@ -104,15 +104,15 @@ class SoftmaxOp : public Operator { } private: - SoftmaxParam param_; -}; // class SoftmaxOp + SoftmaxOutputParam param_; +}; // class SoftmaxOutputOp // Decalre Factory function, used for dispatch specialization template -Operator* CreateOp(SoftmaxParam param); +Operator* CreateOp(SoftmaxOutputParam param); #if DMLC_USE_CXX11 -class SoftmaxProp : public OperatorProperty { +class SoftmaxOutputProp : public OperatorProperty { public: std::vector ListArguments() const override { return {"data", "label"}; @@ -134,10 +134,10 @@ class SoftmaxProp : public OperatorProperty { const TShape &dshape = in_shape->at(0); if (dshape.ndim() == 0) return false; if (param_.multi_output) { - SHAPE_ASSIGN_CHECK(*in_shape, softmax_enum::kLabel, + SHAPE_ASSIGN_CHECK(*in_shape, softmaxout_enum::kLabel, Shape2(dshape[0], dshape.Size()/dshape[0]/dshape[1])); } else { - SHAPE_ASSIGN_CHECK(*in_shape, softmax_enum::kLabel, Shape1(dshape[0])); + SHAPE_ASSIGN_CHECK(*in_shape, softmaxout_enum::kLabel, Shape1(dshape[0])); } out_shape->clear(); out_shape->push_back(dshape); @@ -145,20 +145,20 @@ class SoftmaxProp : public OperatorProperty { } OperatorProperty* Copy() const override { - auto ptr = new SoftmaxProp(); + auto ptr = new SoftmaxOutputProp(); ptr->param_ = param_; return ptr; } std::string TypeString() const override { - return "Softmax"; + return "SoftmaxOutput"; } std::vector DeclareBackwardDependency( const std::vector &out_grad, const std::vector &in_data, const std::vector &out_data) const override { - return {in_data[softmax_enum::kLabel], out_data[softmax_enum::kOut]}; + return {in_data[softmaxout_enum::kLabel], out_data[softmaxout_enum::kOut]}; } std::vector > BackwardInplaceOption( @@ -166,22 +166,35 @@ class SoftmaxProp : public OperatorProperty { const std::vector &in_data, const std::vector &out_data, const std::vector &in_grad) const override { - return 
{{out_data[softmax_enum::kOut], in_grad[softmax_enum::kData]}}; + return {{out_data[softmaxout_enum::kOut], in_grad[softmaxout_enum::kData]}}; } std::vector > ForwardInplaceOption( const std::vector &in_data, const std::vector &out_data) const override { - return {{in_data[softmax_enum::kData], out_data[softmax_enum::kOut]}}; + return {{in_data[softmaxout_enum::kData], out_data[softmaxout_enum::kOut]}}; } - Operator* CreateOperator(Context ctx) const; + Operator* CreateOperator(Context ctx) const override; - private: - SoftmaxParam param_; -}; // class SoftmaxProp + protected: + SoftmaxOutputParam param_; +}; // class SoftmaxOutputProp + +class DeprecatedSoftmaxProp : public SoftmaxOutputProp { + public: + void Init(const std::vector >& kwargs) override { + LOG(INFO) << "Softmax symbol is renamed to SoftmaxOutput. " + << "This API will be deprecated in Dec, 2015"; + SoftmaxOutputProp::param_.Init(kwargs); + } + + std::string TypeString() const override { + return "Softmax"; + } +}; #endif // DMLC_USE_CXX11 } // namespace op } // namespace mxnet -#endif // MXNET_OPERATOR_SOFTMAX_INL_H_ +#endif // MXNET_OPERATOR_SOFTMAX_OUTPUT_INL_H_ diff --git a/src/operator/softmax_output.cc b/src/operator/softmax_output.cc new file mode 100644 index 000000000000..bc1ba367ccaa --- /dev/null +++ b/src/operator/softmax_output.cc @@ -0,0 +1,34 @@ +/*! + * Copyright (c) 2015 by Contributors + * \file softmax_output.cc + * \brief + * \author Bing Xu +*/ +#include "./softmax_output-inl.h" + +namespace mxnet { +namespace op { +template<> +Operator *CreateOp(SoftmaxOutputParam param) { + return new SoftmaxOutputOp(param); +} + +Operator *SoftmaxOutputProp::CreateOperator(Context ctx) const { + DO_BIND_DISPATCH(CreateOp, param_); +} + +DMLC_REGISTER_PARAMETER(SoftmaxOutputParam); + +MXNET_REGISTER_OP_PROPERTY(SoftmaxOutput, SoftmaxOutputProp) +.describe("Perform a softmax transformation on input, backprop with logloss.") +.add_argument("data", "Symbol", "Input data to softmax.") +.add_arguments(SoftmaxOutputParam::__FIELDS__()); + +MXNET_REGISTER_OP_PROPERTY(Softmax, DeprecatedSoftmaxProp) +.describe("DEPRECATED: Perform a softmax transformation on input. Please use SoftmaxOutput") +.add_argument("data", "Symbol", "Input data to softmax.") +.add_arguments(SoftmaxOutputParam::__FIELDS__()); + +} // namespace op +} // namespace mxnet + diff --git a/src/operator/softmax.cu b/src/operator/softmax_output.cu similarity index 51% rename from src/operator/softmax.cu rename to src/operator/softmax_output.cu index 0ebbfc16ce68..02b92c041c47 100644 --- a/src/operator/softmax.cu +++ b/src/operator/softmax_output.cu @@ -1,17 +1,17 @@ /*! 
* Copyright (c) 2015 by Contributors - * \file softmax.cu + * \file softmax_output.cu * \brief * \author Bing Xu */ -#include "./softmax-inl.h" +#include "./softmax_output-inl.h" namespace mxnet { namespace op { template<> -Operator *CreateOp(SoftmaxParam param) { - return new SoftmaxOp(param); +Operator *CreateOp(SoftmaxOutputParam param) { + return new SoftmaxOutputOp(param); } } // namespace op diff --git a/tests/python/common/models.py b/tests/python/common/models.py index 71df3f07cf47..2c998afcd1db 100644 --- a/tests/python/common/models.py +++ b/tests/python/common/models.py @@ -24,6 +24,6 @@ def conv(): fl = mx.symbol.Flatten(data = mp2, name="flatten") fc2 = mx.symbol.FullyConnected(data = fl, name='fc2', num_hidden=10) - softmax = mx.symbol.Softmax(data = fc2, name = 'sm') + softmax = mx.symbol.SoftmaxOutput(data = fc2, name = 'sm') return softmax diff --git a/tests/python/multi-node/common.py b/tests/python/multi-node/common.py index 0db092462a78..d35a1a1fe3f4 100644 --- a/tests/python/multi-node/common.py +++ b/tests/python/multi-node/common.py @@ -86,7 +86,7 @@ def mlp(): fc2 = mx.symbol.FullyConnected(act1, name = 'fc2', num_hidden = 64) act2 = mx.symbol.Activation(fc2, name='relu2', act_type="relu") fc3 = mx.symbol.FullyConnected(act2, name='fc3', num_hidden=10) - softmax = mx.symbol.Softmax(fc3, name = 'sm') + softmax = mx.symbol.SoftmaxOutput(fc3, name = 'sm') return softmax def lenet(): @@ -109,7 +109,7 @@ def lenet(): # second fullc fc2 = mx.symbol.FullyConnected(data=tanh3, num_hidden=10) # loss - lenet = mx.symbol.Softmax(data=fc2) + lenet = mx.symbol.SoftmaxOutput(data=fc2) return lenet # Basic Conv + BN + ReLU factory @@ -155,5 +155,5 @@ def inception(): pool = mx.symbol.Pooling(data=in5b, pool_type="avg", kernel=(7,7), name="global_pool") flatten = mx.symbol.Flatten(data=pool, name="flatten1") fc = mx.symbol.FullyConnected(data=flatten, num_hidden=10, name="fc1") - softmax = mx.symbol.Softmax(data=fc, name="loss") + softmax = mx.symbol.SoftmaxOutput(data=fc, name="loss") return softmax diff --git a/tests/python/multi-node/imagenet.py b/tests/python/multi-node/imagenet.py index 7663df8d1bad..f4d7c1e35bb3 100644 --- a/tests/python/multi-node/imagenet.py +++ b/tests/python/multi-node/imagenet.py @@ -97,5 +97,5 @@ def inception(nhidden): # linear classifier flatten = mx.symbol.Flatten(data=avg, name='flatten') fc1 = mx.symbol.FullyConnected(data=flatten, num_hidden=nhidden, name='fc1') - softmax = mx.symbol.Softmax(data=fc1, name='softmax') + softmax = mx.symbol.SoftmaxOutput(data=fc1, name='softmax') return softmax diff --git a/tests/python/train/test_conv.py b/tests/python/train/test_conv.py index bc068153c24e..9d8f77fd7c65 100644 --- a/tests/python/train/test_conv.py +++ b/tests/python/train/test_conv.py @@ -24,7 +24,7 @@ fl = mx.symbol.Flatten(data = mp2, name="flatten") fc2 = mx.symbol.FullyConnected(data = fl, name='fc2', num_hidden=10) -softmax = mx.symbol.Softmax(data = fc2, name = 'sm') +softmax = mx.symbol.SoftmaxOutput(data = fc2, name = 'sm') num_epoch = 1 model = mx.model.FeedForward(softmax, mx.cpu(), diff --git a/tests/python/train/test_mlp.py b/tests/python/train/test_mlp.py index 5f1c27062066..84a6f17f47d5 100644 --- a/tests/python/train/test_mlp.py +++ b/tests/python/train/test_mlp.py @@ -14,7 +14,7 @@ fc2 = mx.symbol.FullyConnected(act1, name = 'fc2', num_hidden = 64) act2 = mx.symbol.Activation(fc2, name='relu2', act_type="relu") fc3 = mx.symbol.FullyConnected(act2, name='fc3', num_hidden=10) -softmax = mx.symbol.Softmax(fc3, name = 'sm') +softmax =
mx.symbol.SoftmaxOutput(fc3, name = 'sm') def accuracy(label, pred): py = np.argmax(pred, axis=1) diff --git a/tests/python/unittest/test_ndarray.py b/tests/python/unittest/test_ndarray.py index c00350a3ad28..fca0093a09c9 100644 --- a/tests/python/unittest/test_ndarray.py +++ b/tests/python/unittest/test_ndarray.py @@ -77,7 +77,7 @@ def test_ndarray_choose(): for repeat in range(nrepeat): indices = np.random.randint(shape[1], size=shape[0]) assert same(npy[np.arange(shape[0]), indices], - mx.nd.choose_element(arr, mx.nd.array(indices)).asnumpy()) + mx.nd.choose_element_0index(arr, mx.nd.array(indices)).asnumpy()) def test_ndarray_choose(): diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index b0743a6f0bb6..c767648f1ece 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -49,45 +49,34 @@ def test_elementwise_sum(): shape = tuple(np.random.randint(1, int(1000**(1.0/dim)), size=dim)) check_elementwise_sum_with_shape(shape, np.random.randint(1, 8)) -def check_slice_channel(dim): +def check_slice_channel(dim, num): + ins = [] if dim == 2: - a = np.ones((2, 2)) * 1. - b = np.ones((2, 2)) * 2. - c = np.ones((2, 2)) * 3. - d = np.ones((2, 2)) * 4. - e = np.hstack((a, b, c, d)) - elif dim == 4: - a = np.ones((2, 2, 2, 2)) * 1. - b = np.ones((2, 2, 2, 2)) * 2. - c = np.ones((2, 2, 2, 2)) * 3. - d = np.ones((2, 2, 2, 2)) * 4. - e = np.hstack((a, b, c, d)) + shape = (2, 2) + else: + shape = (2, 2, 2, 3) + ins = [np.ones(shape) * i for i in range(num)] + e = np.hstack(ins) + + e_nd = mx.nd.empty(e.shape) e_nd[:] = e data = mx.sym.Variable('data') - op = mx.sym.SliceChannel(data=data, num_outputs=4) + op = mx.sym.SliceChannel(data=data, num_outputs=num) arg_shape, output_shape, aux_shape = op.infer_shape(data=e_nd.shape) grad_nd = [mx.nd.empty(shape) for shape in arg_shape] exe = op.bind(mx.cpu(), args=[e_nd], args_grad=grad_nd) - assert len(exe.outputs) == 4 - o1_nd = exe.outputs[0] - o2_nd = exe.outputs[1] - o3_nd = exe.outputs[2] - o4_nd = exe.outputs[3] + assert len(exe.outputs) == num + o_nd = [exe.outputs[i] for i in range(num)] # test forward exe.forward() - assert reldiff(o1_nd.asnumpy(), a) < 1e-5 - assert reldiff(o2_nd.asnumpy(), b) < 1e-5 - assert reldiff(o3_nd.asnumpy(), c) < 1e-5 - assert reldiff(o4_nd.asnumpy(), d) < 1e-5 + for i in range(num): + assert reldiff(o_nd[i].asnumpy(), ins[i]) < 1e-5 # test backward - o1_nd += 4. - o2_nd += 3. - o3_nd += 2. - o4_nd += 1. 
- exe.backward([o1_nd, o2_nd, o3_nd, o4_nd]) - assert reldiff(grad_nd[0].asnumpy(), np.hstack((a+4,b+3, c+2, d+1))) < 1e-5 + for i in range(num): + o_nd[i] += i + exe.backward(o_nd) + assert reldiff(grad_nd[0].asnumpy(), np.hstack([ins[i] + i for i in range(num)])) < 1e-5 def check_concat_with_shape(shapes): n = len(shapes) @@ -140,8 +129,9 @@ def test_concat(): check_concat_with_shape(shapes) def test_slice_channel(): - check_slice_channel(2) - check_slice_channel(4) + check_slice_channel(2, 4) + check_slice_channel(4, 4) + check_slice_channel(2, 16) def check_regression(symbol, forward, backward): data = mx.symbol.Variable('data') @@ -202,10 +192,25 @@ def check_multi_softmax_with_shape(shape, xpu): exec1.backward() print(grad.asnumpy()) +def test_python_op(): + X = mx.symbol.Variable('X') + op = mx.operator.NumpyOp() + s = op.get_symbol(X, name='numpy_op') + + x = mx.ndarray.ones((10))*10 + dx = mx.ndarray.zeros((10)) + dy = mx.ndarray.ones((10)) + exec1 = s.bind(mx.cpu(), args=[x], args_grad = {'X': dx}) + exec1.forward() + assert reldiff(x.asnumpy(), exec1.outputs[0].asnumpy()) < 1e-5 + exec1.backward(dy) + assert reldiff(dy.asnumpy(), dx.asnumpy()) < 1e-5 + if __name__ == '__main__': test_elementwise_sum() test_concat() test_slice_channel() test_regression() + test_python_op() #check_softmax_with_shape((3,4), mx.cpu()) #check_multi_softmax_with_shape((3,4,5), mx.cpu()) diff --git a/tests/travis/run_test.sh b/tests/travis/run_test.sh index a0ebcd9edfcd..451f2e272a50 100755 --- a/tests/travis/run_test.sh +++ b/tests/travis/run_test.sh @@ -45,6 +45,30 @@ if [ ${TASK} == "cpp_test" ]; then exit 0 fi +if [ ${TASK} == "r_test" ]; then + make all || exit -1 + # use cached dir for storing data + rm -rf ${PWD}/data + mkdir -p ${CACHE_PREFIX}/data + ln -s ${CACHE_PREFIX}/data ${PWD}/data + + set -e + export _R_CHECK_TIMINGS_=0 + export R_BUILD_ARGS="--no-build-vignettes --no-manual" + export R_CHECK_ARGS="--no-vignettes --no-manual" + + curl -OL http://raw.github.com/craigcitro/r-travis/master/scripts/travis-tool.sh + chmod 755 ./travis-tool.sh + ./travis-tool.sh bootstrap + ./travis-tool.sh install_aptget r-cran-testthat r-cran-Rcpp r-cran-DiagrammeR r-cran-data.table r-cran-jsonlite r-cran-magrittr r-cran-stringr + + R CMD INSTALL R-package + cd ./R-package + ../travis-tool.sh install_deps + ../travis-tool.sh run_tests + exit 0 +fi + if [ ${TASK} == "python_test" ]; then make all || exit -1 # use cached dir for storing data diff --git a/tests/travis/travis_after_failure.sh b/tests/travis/travis_after_failure.sh new file mode 100644 index 000000000000..ad9616edd94c --- /dev/null +++ b/tests/travis/travis_after_failure.sh @@ -0,0 +1,5 @@ +#!/bin/bash + +if [ ${TASK} == "r_test" ]; then + cat mxnet/mxnet.Rcheck/*.log +fi \ No newline at end of file diff --git a/tools/caffe_converter/convert_model.py b/tools/caffe_converter/convert_model.py index 7f362dbbe06d..87c51b82692a 100644 --- a/tools/caffe_converter/convert_model.py +++ b/tools/caffe_converter/convert_model.py @@ -51,10 +51,10 @@ def main(): first_conv = False model = mx.model.FeedForward(ctx=mx.cpu(), symbol=prob, - arg_params=arg_params, aux_params={}, num_round=1, + arg_params=arg_params, aux_params={}, num_epoch=1, learning_rate=0.05, momentum=0.9, wd=0.0001) model.save(args.save_model_name) if __name__ == '__main__': - main() \ No newline at end of file + main() diff --git a/tools/caffe_converter/convert_symbol.py b/tools/caffe_converter/convert_symbol.py index 9b5bcde99848..88acea0b7c4e 100644 --- 
a/tools/caffe_converter/convert_symbol.py +++ b/tools/caffe_converter/convert_symbol.py @@ -41,7 +41,7 @@ def proto2script(proto_file): name = layer[i].name.replace('/', '_') if layer[i].type == 'Convolution' or layer[i].type == 4: type_string = 'mx.symbol.Convolution' - param = layer[i].convolution_param + param = layer[i].convolution_param pad = 0 if len(param.pad) == 0 else param.pad[0] stride = 1 if len(param.stride) == 0 else param.stride[0] param_string = "num_filter=%d, pad=(%d,%d), kernel=(%d,%d), stride=(%d,%d), no_bias=%s" %\ @@ -67,7 +67,7 @@ def proto2script(proto_file): need_flatten[name] = need_flatten[mapping[layer[i].bottom[0]]] if layer[i].type == 'LRN' or layer[i].type == 15: type_string = 'mx.symbol.LRN' - param = layer[i].lrn_param + param = layer[i].lrn_param param_string = "alpha=%f, beta=%f, knorm=%f, nsize=%d" %\ (param.alpha, param.beta, param.k, param.local_size) need_flatten[name] = True @@ -82,7 +82,7 @@ def proto2script(proto_file): param_string = "p=%f" % param.dropout_ratio need_flatten[name] = need_flatten[mapping[layer[i].bottom[0]]] if layer[i].type == 'Softmax' or layer[i].type == 20: - type_string = 'mx.symbol.Softmax' + type_string = 'mx.symbol.SoftmaxOutput' # We only support single output network for now. output_name = name @@ -96,7 +96,7 @@ def proto2script(proto_file): need_flatten[name] = True if type_string == '': raise Exception('Unknown Layer %s!' % layer[i].type) - + if type_string != 'split': bottom = layer[i].bottom if param_string != "": @@ -137,4 +137,4 @@ def main(): print(symbol_string) if __name__ == '__main__': - main() \ No newline at end of file + main()
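
Migration note: after this patch, mx.symbol.Softmax survives only as a deprecated alias backed by DeprecatedSoftmaxProp; new code should call mx.symbol.SoftmaxOutput, as the updated tests and the Caffe converter above now do. A minimal sketch of the renamed API in user code follows; the layer names and sizes are illustrative, not taken from this patch:

    import mxnet as mx

    # Toy two-layer classifier; SoftmaxOutput replaces the old mx.symbol.Softmax.
    data = mx.symbol.Variable('data')
    fc1 = mx.symbol.FullyConnected(data=data, name='fc1', num_hidden=128)
    act1 = mx.symbol.Activation(data=fc1, name='relu1', act_type='relu')
    fc2 = mx.symbol.FullyConnected(data=act1, name='fc2', num_hidden=10)
    # The old spelling mx.symbol.Softmax(...) still resolves through the
    # DeprecatedSoftmaxProp shim, but logs an INFO message pointing to the new name.
    net = mx.symbol.SoftmaxOutput(data=fc2, name='sm')

The shim keeps existing training scripts running while the log message gives users until Dec. 2015 to migrate.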