diff --git a/.gitmodules b/.gitmodules index c1084105e4aa..0de60d4c80e7 100644 --- a/.gitmodules +++ b/.gitmodules @@ -6,4 +6,4 @@ url = https://github.com/dmlc/dmlc-core.git [submodule "ps-lite"] path = ps-lite - url = https://github.com/dmlc/ps-lite.git + url = https://github.com/dmlc/ps-lite diff --git a/CMakeLists.txt b/CMakeLists.txt index d55086fd197a..67e0b881df5b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -33,9 +33,6 @@ else(MSVC) endif(MSVC) if(USE_OPENCV) - if(MSVC) - set(OpenCV_STATIC OFF) - endif() find_package(OpenCV QUIET COMPONENTS core highgui imgproc imgcodecs) if(NOT OpenCV_FOUND) # if not OpenCV 3.x, then imgcodecs are not found find_package(OpenCV REQUIRED COMPONENTS core highgui imgproc) @@ -68,7 +65,9 @@ if(USE_CUDNN) endif() add_subdirectory("dmlc-core") -add_subdirectory("ps-lite") +if(NOT MSVC) + add_subdirectory("ps-lite") +endif() mxnet_source_group("Source" GLOB_RECURSE "src/*.cc") mxnet_source_group("Source\\Cuda" GLOB_RECURSE "src/*.cu") @@ -76,6 +75,16 @@ mxnet_source_group("Source\\Cuda" GLOB_RECURSE "src/*.cu") FILE(GLOB_RECURSE SOURCE "src/*.cc") FILE(GLOB_RECURSE cuda "src/*.cu") +if(MSVC) + foreach(flag_var + CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE + CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO) + if(${flag_var} MATCHES "/MD") + string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}") + endif(${flag_var} MATCHES "/MD") + endforeach(flag_var) +endif() + if(USE_CUDA) # define preprocessor macro so that we will not include the generated forcelink header mshadow_cuda_compile(cuda_objs ${cuda}) @@ -93,8 +102,10 @@ endif() add_library(mxnet SHARED ${SOURCE}) target_link_libraries(mxnet ${mshadow_LINKER_LIBS}) target_link_libraries(mxnet dmlccore) -target_link_libraries(mxnet pslite) -target_link_libraries(mxnet ${pslite_LINKER_LIBS}) +if(NOT MSVC) + target_link_libraries(mxnet pslite) + target_link_libraries(mxnet ${pslite_LINKER_LIBS}) +endif() set_target_properties(mxnet PROPERTIES OUTPUT_NAME "libmxnet") # ---[ Linter target diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 63685d9af1fd..9f72042fb3ce 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -40,14 +40,18 @@ List of Contributors * [Full List of Contributors](https://github.com/dmlc/mxnet/graphs/contributors) - To contributors: please add your name to the list when you submit a patch to the project:) * [Qiang Kou](https://github.com/thirdwing) - - KK is a R ninja, he will make mxnet available for R users. + - KK is a R ninja, he makes mxnet available for R users. +* [Tong He](https://github.com/hetong007) + - Tong is the major maintainer of MXNetR, he designs the mxnet interface and wrote many of the tutorials on R. * [Feng Wang](https://github.com/happynear) - Feng makes mxnet compatible with Windows Visual Studio. 
* [Li Dong](https://github.com/donglixp) * [Piji Li](https://github.com/lipiji) * [Hu Shiwen](https://github.com/yajiedesign) * [Boyuan Deng](https://github.com/bryandeng) -* [Tong He](https://github.com/hetong007) * [Junran He](https://github.com/junranhe) - Junran makes device kvstore allocation strategy smarter * [Shuzhe Wu](https://github.com/II-Matto) +* [Xiaodong](https://github.com/XD-DENG) +* [Nan Xiao](https://github.com/road2stat) +* [Junyuan Xie](https://github.com/piiswrong) diff --git a/Makefile b/Makefile index 5ee09d05ff34..da5e73ce504b 100644 --- a/Makefile +++ b/Makefile @@ -34,7 +34,11 @@ else endif CFLAGS += -I./mshadow/ -I./dmlc-core/include -fPIC -Iinclude $(MSHADOW_CFLAGS) LDFLAGS = -pthread $(MSHADOW_LDFLAGS) $(DMLC_LDFLAGS) -NVCCFLAGS = --use_fast_math -g -O3 -ccbin $(CXX) $(MSHADOW_NVCCFLAGS) +ifeq ($(DEBUG), 1) + NVCCFLAGS = -g -G -O0 -ccbin $(CXX) $(MSHADOW_NVCCFLAGS) +else + NVCCFLAGS = --use_fast_math -g -O3 -ccbin $(CXX) $(MSHADOW_NVCCFLAGS) +endif ROOTDIR = $(CURDIR) ifndef LINT_LANG @@ -80,7 +84,7 @@ PS_PATH=./ps-lite DEPS_PATH=$(shell pwd)/deps include $(PS_PATH)/make/ps.mk ifeq ($(USE_DIST_KVSTORE), 1) - CFLAGS += -DMXNET_USE_DIST_KVSTORE -I$(PS_PATH)/src + CFLAGS += -DMXNET_USE_DIST_KVSTORE -I$(PS_PATH)/include -I$(DEPS_PATH)/include LIB_DEP += $(PS_PATH)/build/libps.a LDFLAGS += -Wl,-rpath,$(DEPS_PATH)/lib $(PS_LDFLAGS_SO) endif @@ -100,12 +104,12 @@ ifeq ($(USE_CUDA), 1) ALL_DEP += $(CUOBJ) endif -build/%.o: src/%.cc $(LIB_DEP) +build/%.o: src/%.cc @mkdir -p $(@D) $(CXX) -std=c++0x $(CFLAGS) -MM -MT build/$*.o $< >build/$*.d $(CXX) -std=c++0x -c $(CFLAGS) -c $< -o $@ -build/%_gpu.o: src/%.cu $(LIB_DEP) +build/%_gpu.o: src/%.cu @mkdir -p $(@D) $(NVCC) $(NVCCFLAGS) -Xcompiler "$(CFLAGS)" -M build/$*_gpu.o $< >build/$*_gpu.d $(NVCC) -c -o $@ $(NVCCFLAGS) -Xcompiler "$(CFLAGS)" $< @@ -120,7 +124,7 @@ lib/libmxnet.so: $(ALL_DEP) # ps-lite $(PS_PATH)/build/libps.a: - $(MAKE) CXX=$(CXX) DEPS_PATH=$(DEPS_PATH) -C $(PS_PATH) deps + $(MAKE) CXX=$(CXX) DEPS_PATH=$(DEPS_PATH) -C $(PS_PATH) protobuf zmq $(MAKE) CXX=$(CXX) DEPS_PATH=$(DEPS_PATH) -C $(PS_PATH) ps $(DMLC_CORE)/libdmlc.a: @@ -137,7 +141,7 @@ include tests/cpp/unittest.mk test: $(TEST) lint: rcpplint - python dmlc-core/scripts/lint.py mxnet ${LINT_LANG} include src scripts python + python dmlc-core/scripts/lint.py mxnet ${LINT_LANG} include src scripts python predict/python doc: doxygen diff --git a/R-package/NAMESPACE b/R-package/NAMESPACE index ab141f4abdc6..e1bb9570fb40 100644 --- a/R-package/NAMESPACE +++ b/R-package/NAMESPACE @@ -33,7 +33,9 @@ export(mx.io.extract) export(mx.kv.create) export(mx.metric.accuracy) export(mx.metric.custom) +export(mx.metric.mae) export(mx.metric.rmse) +export(mx.metric.rmsle) export(mx.model.FeedForward.create) export(mx.model.load) export(mx.model.save) diff --git a/R-package/R/callback.R b/R-package/R/callback.R index 36fb3571a9b8..784a2e31a9d3 100644 --- a/R-package/R/callback.R +++ b/R-package/R/callback.R @@ -2,7 +2,7 @@ #' @export mx.callback.log.train.metric <- function(period) { function(iteration, nbatch, env) { - if (nbatch %% period == 0) { + if (nbatch %% period == 0 && !is.null(env$metric)) { result <- env$metric$get(env$train.metric) cat(paste0("Batch [", nbatch, "] Train-", result$name, "=", result$value, "\n")) } diff --git a/R-package/R/io.R b/R-package/R/io.R index 5fe51c0eb70e..cde2b4c032f1 100644 --- a/R-package/R/io.R +++ b/R-package/R/io.R @@ -21,7 +21,8 @@ mx.io.extract <- function(iter, field) { padded <- iter$num.pad() data <- dlist[[field]] 
oshape <- dim(data) - packer$push(mx.nd.slice(data, 0, oshape[[1]] - padded)) + ndim <- length(oshape) + packer$push(mx.nd.slice(data, 0, oshape[[ndim]] - padded)) } iter$reset() return(packer$get()) diff --git a/R-package/R/metric.R b/R-package/R/metric.R index 97cc7314977d..923fec996af5 100644 --- a/R-package/R/metric.R +++ b/R-package/R/metric.R @@ -1,7 +1,7 @@ #' Helper function to create a customized metric -#' +#' #' @export -mx.metric.custom <-function(name, feval) { +mx.metric.custom <- function(name, feval) { init <- function() { c(0, 0) } @@ -18,18 +18,34 @@ mx.metric.custom <-function(name, feval) { return(ret) } -#' Accuracy metric +#' Accuracy metric for classification #' #' @export mx.metric.accuracy <- mx.metric.custom("accuracy", function(label, pred) { - ypred = max.col(pred, tie="first") - return(sum((label + 1) == ypred) / length(label)) + ypred = max.col(t(as.array(pred)), tie="first") + return(sum((as.array(label) + 1) == ypred) / length(label)) }) -#' RMSE metric -#' +#' RMSE (Root Mean Squared Error) metric for regression +#' #' @export mx.metric.rmse <- mx.metric.custom("rmse", function(label, pred) { res <- sqrt(mean((label-pred)^2)) return(res) }) + +#' MAE (Mean Absolute Error) metric for regression +#' +#' @export +mx.metric.mae <- mx.metric.custom("mae", function(label, pred) { + res <- mean(abs(label-pred)) + return(res) +}) + +#' RMSLE (Root Mean Squared Logarithmic Error) metric for regression +#' +#' @export +mx.metric.rmsle <- mx.metric.custom("rmsle", function(label, pred) { + res <- sqrt(mean((log(pred + 1) - log(label + 1))^2)) + return(res) +}) diff --git a/R-package/R/model.R b/R-package/R/model.R index e3011bd54c1f..f3a9c9964409 100644 --- a/R-package/R/model.R +++ b/R-package/R/model.R @@ -1,12 +1,13 @@ # slice the shape on the highest dimension mx.model.slice.shape <- function(shape, nsplit) { - batchsize <- shape[[1]] + ndim <- length(shape) + batchsize <- shape[[ndim]] step <- as.integer((batchsize + nsplit - 1) / nsplit) lapply(0:(nsplit - 1), function(k) { begin = min(k * step, batchsize) end = min((k + 1) * step, batchsize) s <- shape - s[[1]] = end - begin + s[[ndim]] = end - begin return(list(begin=begin, end=end, shape=s)) }) } @@ -266,12 +267,75 @@ mx.model.init.iter <- function(X, y, batch.size, is.train) { if (is.null(y)) { if (is.train) stop("Need to provide parameter y for training with R arrays.") shape <- dim(X) - y <- c(1:shape[[1]]) * 0 + ndim <- length(shape) + y <- c(1:shape[[ndim]]) * 0 } batch.size <- min(length(y), batch.size) return(mx.io.arrayiter(X, y, batch.size=batch.size, shuffle=is.train)) } +# select layout by matching shape, report error if nothing matches up. +mx.model.select.layout.train <- function(X, y) { + if (is.null(y)) stop("Need to provide y for training") + y <- as.array(y) + dimX <- dim(X) + dimy <- dim(y) + if (length(dimX) != 2) return("colmajor") + rowmajor <- 0 + colmajor <- 0 + if (dimX[[1]] == dimy[[1]]) rowmajor <- 1 + if (dimX[[length(dimX)]] == dimy[[length(dimy)]]) colmajor <- 1 + if (rowmajor + colmajor != 1) { + stop("Cannot auto select array.layout, please specify this parameter") + } + if (rowmajor == 1) { + cat("Auto detect layout of input matrix, use rowmajor..\n") + return("rowmajor") + } else{ + cat("Auto detect layout input matrix, use colmajor..\n") + return("colmajor") + } +} + +# select layout by matching shape, report error if nothing matches up. 
+mx.model.select.layout.predict <- function(X, model) { + dimX <- dim(X) + if (length(dimX) != 2) return("colmajor") + rowmajor <- 1 + colmajor <- 1 + # try row major + ret <- mx.symbol.infer.shape(model$symbol, data=c(dimX[[2]], 1)) + if (!is.null(ret)) { + names = names(model$arg.params) + for (i in 1:length(names)) { + if (any(ret$arg.shapes[[names[i]]] != dim(model$arg.params[[i]]))) { + rowmajor <- 0 + } + } + } + # try col major + ret <- mx.symbol.infer.shape(model$symbol, data=c(dimX[[1]], 1)) + if (!is.null(ret)) { + names = names(model$arg.params) + for (i in 1:length(names)) { + if (any(ret$arg.shapes[[names[i]]] != dim(model$arg.params[[i]]))) { + colmajor <- 0 + } + } + } + if (rowmajor + colmajor != 1) { + stop("Cannot auto select array.layout, please specify this parameter") + } + if (rowmajor == 1) { + cat("Auto detect layout of input matrix, use rowmajor..\n") + return("rowmajor") + } else{ + cat("Auto detect layout input matrix, use colmajor..\n") + return("colmajor") + } +} + + #' Create a MXNet Feedforward neural net model with the specified training. #' #' @param symbol The symbolic configuration of the neural network. @@ -297,6 +361,12 @@ mx.model.init.iter <- function(X, y, batch.size, is.train) { #' The callback when one mini-batch iteration ends. #' @param array.batch.size integer (default=128) #' The batch size used for R array training. +#' @param array.layout can be "auto", "colmajor", "rowmajor", (detault=auto) +#' The layout of array. "rowmajor" is only supported for two dimensional array. +#' For matrix, "rowmajor" means dim(X) = c(nexample, nfeatures), +#' "colmajor" means dim(X) = c(nfeatures, nexample) +#' "auto" will auto detect the layout by match the feature size, +#' and will report error when X is a square matrix to ask user to explicitly specify layout. #' @param kvstore string (default="local") #' The parameter synchronization scheme in multiple devices. #' @return model A trained mxnet model. @@ -308,9 +378,17 @@ function(symbol, X, y=NULL, ctx=NULL, initializer=mx.init.uniform(0.01), eval.data=NULL, eval.metric=NULL, iter.end.callback=NULL, epoch.end.callback=NULL, - array.batch.size=128, + array.batch.size=128, array.layout="auto", kvstore="local", ...) { + if (is.array(X) || is.matrix(X)) { + if (array.layout == "auto") { + array.layout <- mx.model.select.layout.train(X, y) + } + if (array.layout == "rowmajor") { + X <- t(X) + } + } X <- mx.model.init.iter(X, y, batch.size=array.batch.size, is.train=TRUE) if (!X$iter.next()) { x$reset() @@ -324,7 +402,8 @@ function(symbol, X, y=NULL, ctx=NULL, } if (!is.list(ctx)) stop("ctx must be mx.context or list of mx.context") if (is.character(optimizer)) { - batchsize = input.shape[[1]] + ndim <- length(input.shape) + batchsize = input.shape[[ndim]] optimizer <- mx.opt.create(optimizer, rescale.grad=(1/batchsize), ...) } @@ -346,10 +425,24 @@ function(symbol, X, y=NULL, ctx=NULL, #' @param X The dataset to predict. #' @param ctx mx.cpu() or mx.gpu(i) The device used to generate the prediction. #' @param array.batch.size The batch size used in batching. Only used when X is R's array. +#' @param array.layout can be "auto", "colmajor", "rowmajor", (detault=auto) +#' The layout of array. "rowmajor" is only supported for two dimensional array. 
+#' For matrix, "rowmajor" means dim(X) = c(nexample, nfeatures), +#' "colmajor" means dim(X) = c(nfeatures, nexample) +#' "auto" will auto detect the layout by match the feature size, +#' and will report error when X is a square matrix to ask user to explicitly specify layout. #' #' @export -predict.MXFeedForwardModel <- function(model, X, ctx=NULL, array.batch.size=128) { +predict.MXFeedForwardModel <- function(model, X, ctx=NULL, array.batch.size=128, array.layout="auto") { if (is.null(ctx)) ctx <- mx.ctx.default() + if (is.array(X) || is.matrix(X)) { + if (array.layout == "auto") { + array.layout <- mx.model.select.layout.predict(X, model) + } + if (array.layout == "rowmajor") { + X <- t(X) + } + } X <- mx.model.init.iter(X, NULL, batch.size=array.batch.size, is.train=FALSE) X$reset() if (!X$iter.next()) stop("Cannot predict on empty iterator") @@ -366,7 +459,8 @@ predict.MXFeedForwardModel <- function(model, X, ctx=NULL, array.batch.size=128) out.pred <- mx.nd.copyto(pexec$ref.outputs[[1]], mx.cpu()) padded <- X$num.pad() oshape <- dim(out.pred) - packer$push(mx.nd.slice(out.pred, 0, oshape[[1]] - padded)) + ndim <- length(oshape) + packer$push(mx.nd.slice(out.pred, 0, oshape[[ndim]] - padded)) } X$reset() return(packer$get()) diff --git a/R-package/R/mxnet_generated.R b/R-package/R/mxnet_generated.R index 14334d5da376..6110dd51eac8 100644 --- a/R-package/R/mxnet_generated.R +++ b/R-package/R/mxnet_generated.R @@ -2,6 +2,18 @@ # Generated by mxnet.export, do not edit by hand. ###### +#' Choose one element from each line(row for python, column for R/Julia) in lhs according to index indicated by rhs +#' +#' @param lhs NDArray +#' Left operand to the function. +#' @param rhs NDArray +#' Right operand to the function. +#' @return out The result mx.ndarray +#' +#' @export +#' @name mx.nd.choose.element +NULL + #' Clip ndarray elements to range (a_min, a_max) #' #' @param src NDArray @@ -28,6 +40,26 @@ NULL #' @name mx.nd.dot NULL +#' Take square root of the src +#' +#' @param src NDArray +#' Source input to the function. +#' @return out The result mx.ndarray +#' +#' @export +#' @name mx.nd.sqrt +NULL + +#' Take square of the src +#' +#' @param src NDArray +#' Source input to the function. +#' @return out The result mx.ndarray +#' +#' @export +#' @name mx.nd.square +NULL + #' Create iterator for dataset packed in recordio. #' #' @param path.imglist string, optional, default='' diff --git a/R-package/README.md b/R-package/README.md index 3c46288fb8c8..859cf95e4551 100644 --- a/R-package/README.md +++ b/R-package/README.md @@ -3,11 +3,11 @@ [![Build Status](https://travis-ci.org/dmlc/mxnet.svg?branch=master)](https://travis-ci.org/dmlc/mxnet) [![Documentation Status](https://readthedocs.org/projects/mxnet/badge/?version=latest)](http://mxnet.readthedocs.org/en/latest/R-package/index.html) -You have find MXNet R Package! The MXNet R packages brings flexible and efficient GPU +You have found MXNet R Package! The MXNet R packages brings flexible and efficient GPU computing and state-of-art deep learning to R. - It enables you to write seamless tensor/matrix computation with multiple GPUs in R. -- It also enables you construct and customize the state-of-art deep learning models in R, +- It also enables you to construct and customize the state-of-art deep learning models in R, and apply them to tasks such as image classification and data science challenges. Sounds exciting? This page contains links to all the related documents on R package. 
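A minimal usage sketch of the `array.layout` argument added to `mx.model.FeedForward.create` and `predict.MXFeedForwardModel` above, assuming a `softmax` symbol and `train.x`/`train.y`/`test.x` objects like those in the tutorials later in this patch, with `train.x` stored one example per row:

```r
require(mxnet)
# train.x: nexamples x nfeatures matrix (one example per row, i.e. "rowmajor")
# train.y: label vector of length nexamples
model <- mx.model.FeedForward.create(softmax, X = train.x, y = train.y,
                                     ctx = mx.cpu(),
                                     array.batch.size = 128,
                                     array.layout = "rowmajor")

# predict() takes the same argument; the default "auto" infers the layout by
# matching shapes and reports an error when X is a square (ambiguous) matrix.
preds <- predict(model, test.x, array.layout = "rowmajor")
```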
diff --git a/R-package/demo/basic_model.R b/R-package/demo/basic_model.R index d849c43974dc..1bf40d647c98 100644 --- a/R-package/demo/basic_model.R +++ b/R-package/demo/basic_model.R @@ -42,7 +42,6 @@ model <- mx.model.FeedForward.create(softmax, X=dtrain, eval.data=dtest, iter.end.callback=mx.callback.save.checkpoint("chkpt"), epoch.end.callback=mx.callback.log.train.metric(100)) - # do prediction pred <- predict(model, dtest) label <- mx.io.extract(dtest, "label") @@ -51,11 +50,10 @@ dataX <- mx.io.extract(dtest, "data") pred2 <- predict(model, X=dataX) accuracy <- function(label, pred) { - ypred = max.col(as.array(pred)) + ypred = max.col(t(as.array(pred))) return(sum((as.array(label) + 1) == ypred) / length(label)) } print(paste0("Finish prediction... accuracy=", accuracy(label, pred))) print(paste0("Finish prediction... accuracy2=", accuracy(label, pred2))) - diff --git a/R-package/man/mx.metric.accuracy.Rd b/R-package/man/mx.metric.accuracy.Rd index c8f4049a6ea6..174d77fed8f9 100644 --- a/R-package/man/mx.metric.accuracy.Rd +++ b/R-package/man/mx.metric.accuracy.Rd @@ -3,7 +3,7 @@ \docType{data} \name{mx.metric.accuracy} \alias{mx.metric.accuracy} -\title{Accuracy metric} +\title{Accuracy metric for classification} \format{\preformatted{List of 3 $ init :function () $ update:function (label, pred, state) @@ -14,7 +14,7 @@ mx.metric.accuracy } \description{ -Accuracy metric +Accuracy metric for classification } \keyword{datasets} diff --git a/R-package/man/mx.metric.mae.Rd b/R-package/man/mx.metric.mae.Rd new file mode 100644 index 000000000000..a98df21f7d7f --- /dev/null +++ b/R-package/man/mx.metric.mae.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2 (4.1.1): do not edit by hand +% Please edit documentation in R/metric.R +\docType{data} +\name{mx.metric.mae} +\alias{mx.metric.mae} +\title{MAE (Mean Absolute Error) metric for regression} +\format{\preformatted{List of 3 + $ init :function () + $ update:function (label, pred, state) + $ get :function (state) + - attr(*, "class")= chr "mx.metric" +}} +\usage{ +mx.metric.mae +} +\description{ +MAE (Mean Absolute Error) metric for regression +} +\keyword{datasets} + diff --git a/R-package/man/mx.metric.rmse.Rd b/R-package/man/mx.metric.rmse.Rd index f6f4cc2d1d87..76b4696a910b 100644 --- a/R-package/man/mx.metric.rmse.Rd +++ b/R-package/man/mx.metric.rmse.Rd @@ -3,7 +3,7 @@ \docType{data} \name{mx.metric.rmse} \alias{mx.metric.rmse} -\title{RMSE metric} +\title{RMSE (Root Mean Squared Error) metric for regression} \format{\preformatted{List of 3 $ init :function () $ update:function (label, pred, state) @@ -14,7 +14,7 @@ mx.metric.rmse } \description{ -RMSE metric +RMSE (Root Mean Squared Error) metric for regression } \keyword{datasets} diff --git a/R-package/man/mx.metric.rmsle.Rd b/R-package/man/mx.metric.rmsle.Rd new file mode 100644 index 000000000000..3e2737fe07b7 --- /dev/null +++ b/R-package/man/mx.metric.rmsle.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2 (4.1.1): do not edit by hand +% Please edit documentation in R/metric.R +\docType{data} +\name{mx.metric.rmsle} +\alias{mx.metric.rmsle} +\title{RMSLE (Root Mean Squared Logarithmic Error) metric for regression} +\format{\preformatted{List of 3 + $ init :function () + $ update:function (label, pred, state) + $ get :function (state) + - attr(*, "class")= chr "mx.metric" +}} +\usage{ +mx.metric.rmsle +} +\description{ +RMSLE (Root Mean Squared Logarithmic Error) metric for regression +} +\keyword{datasets} + diff --git a/R-package/src/Makevars.win b/R-package/src/Makevars.win new file 
mode 100644 index 000000000000..67452b8f634a --- /dev/null +++ b/R-package/src/Makevars.win @@ -0,0 +1,13 @@ +# _*_ mode: makefile; _*_ +PKGROOT=../../ + +# This file is only used for compilation from github +# It will be replaced by more formal Rpackage structure +# Where PKGROOT moved to root directory + +.PHONY: all mxnet +all: $(SHLIB) + + +PKG_CPPFLAGS = -I$(PKGROOT)/include -I$(PKGROOT)/dmlc-core/include +PKG_LIBS = -L../inst/libs/x64/ -llibmxnet diff --git a/R-package/src/base.h b/R-package/src/base.h index fae6b005958e..a9763cc022be 100644 --- a/R-package/src/base.h +++ b/R-package/src/base.h @@ -268,10 +268,11 @@ inline std::string toPyString(const std::string &key, const Rcpp::RObject& val) if (len != 1) { RCHECK(TYPEOF(val) == INTSXP || TYPEOF(val) == REALSXP) << "Only accept integer vectors or simple types"; + // Do shape convesion back to reversed shape. Rcpp::IntegerVector vec(val); os << "("; for (size_t i = 0; i < vec.size(); ++i) { - int value = vec[i]; + int value = vec[vec.size() - i - 1]; if (i != 0) os << ", "; os << value; } @@ -327,14 +328,15 @@ inline std::vector SafeGetListNames(const Rcpp::List& src) { } /*! - * \brief convert Rcpp's Dimension to shape vector + * \brief convert Rcpp's Dimension to internal shape vector + * This will reverse the shape layout internally * \param rshape The dimension in R - * \return A vector representation in R. + * \return A internal vector representation of shapes in mxnet. */ -inline std::vector Dim2Vec(const Rcpp::Dimension &rshape) { +inline std::vector Dim2InternalShape(const Rcpp::Dimension &rshape) { std::vector shape(rshape.size()); for (size_t i = 0; i < rshape.size(); ++i) { - shape[i] = rshape[i]; + shape[rshape.size() - i - 1] = rshape[i]; } return shape; } diff --git a/R-package/src/executor.cc b/R-package/src/executor.cc index 33a28218f103..3b4fd9becf50 100644 --- a/R-package/src/executor.cc +++ b/R-package/src/executor.cc @@ -118,7 +118,7 @@ inline Rcpp::List* CreateArrayList(const Rcpp::List& source_array, RCHECK(Rcpp::is(source_array[i])) << "Expect input " << key << " to be list of " << NDArray::TypeName(); NDArray src = NDArray::FromRObject(source_array[i]); - ret->at(i) = NDArray::Empty(src.shape(), ctx); + ret->at(i) = NDArray::Empty(src.dim(), ctx); NDArray dst = NDArray::FromRObject(ret->at(i)); handles->at(i) = dst->handle; NDArray::CopyFromTo(src, &dst); @@ -146,7 +146,7 @@ inline Rcpp::List* CreateGradList(const Rcpp::List& source_array, RCHECK(Rcpp::is(grad_reqs[i])) << "Expect input grad_reqs to be list of booleans"; if (Rcpp::as(grad_reqs[i])) { - ret->at(i) = NDArray::Empty(NDArray::FromRObject(source_array[i]).shape(), ctx); + ret->at(i) = NDArray::Empty(NDArray::FromRObject(source_array[i]).dim(), ctx); handles->at(i) = NDArray::FromRObject(ret->at(i))->handle; grad_req_type->at(i) = 1; } diff --git a/R-package/src/io.cc b/R-package/src/io.cc index d0916b25b5ec..f84fd2159aea 100644 --- a/R-package/src/io.cc +++ b/R-package/src/io.cc @@ -43,6 +43,18 @@ ArrayDataIter::ArrayDataIter(const Rcpp::NumericVector& data, const Rcpp::NumericVector& unif_rnds, int batch_size, bool shuffle) : counter_(0) { + Rcpp::IntegerVector dshape = data.attr("dim"); + Rcpp::IntegerVector lshape = label.attr("dim"); + if (dshape[dshape.size() - 1] != lshape[lshape.size() - 1]) { + if (dshape[0] == lshape[0]) { + RLOG_FATAL << "Seems X, y was passed in a Row major way, " + << "MXNetR adopts a column major convention.\n" + << "Please pass in transpose of X instead"; + } else { + RLOG_FATAL << "Data and label shape 
in-consistent"; + } + } + std::vector order(label.size()); for (size_t i = 0; i < order.size(); ++i) { order[i] = i; @@ -70,15 +82,16 @@ void ArrayDataIter::Convert(const Rcpp::NumericVector& src, std::vector *out) { Rcpp::RObject dim = src.attr("dim"); Rcpp::Dimension rshape(dim); - std::vector temp, batch; - ConvertToRowMajor(src, &temp); + size_t ndim = rshape.size(); + std::vector temp(src.size()), batch; + std::copy(src.begin(), src.end(), temp.begin()); out->clear(); - out->reserve(rshape[0] / batch_size + 1); + out->reserve(rshape[ndim - 1] / batch_size + 1); size_t line_size = 1; - for (size_t i = 1; i < rshape.size(); ++i) { + for (size_t i = 0; i < rshape.size() - 1; ++i) { line_size *= rshape[i]; } - rshape[0] = batch_size; + rshape[ndim - 1] = batch_size; batch.resize(batch_size * line_size, 0.0f); for (size_t begin = 0; begin < order.size(); begin += batch_size) { @@ -123,7 +136,7 @@ int ArrayDataIter::NumPad() const { Rcpp::RObject ArrayDataIter::Create(const Rcpp::NumericVector& data, const Rcpp::NumericVector& label, const Rcpp::NumericVector& unif_rnds, - size_t batch_size, + int batch_size, bool shuffle) { return Rcpp::internal::make_new_object( new ArrayDataIter(data, label, unif_rnds, batch_size, shuffle)); diff --git a/R-package/src/io.h b/R-package/src/io.h index e643b958e944..8a68ec7d30df 100644 --- a/R-package/src/io.h +++ b/R-package/src/io.h @@ -118,7 +118,7 @@ class ArrayDataIter : public DataIter { static Rcpp::RObject Create(const Rcpp::NumericVector& data, const Rcpp::NumericVector& label, const Rcpp::NumericVector& unif_rnds, - size_t batch_size, + int batch_size, bool shuffle); private: diff --git a/R-package/src/ndarray.cc b/R-package/src/ndarray.cc index a15f05aac0e1..227237e7ac5a 100644 --- a/R-package/src/ndarray.cc +++ b/R-package/src/ndarray.cc @@ -121,30 +121,25 @@ inline void RowToColMajor(const mx_float *in_data, } } -void ConvertToRowMajor(const Rcpp::NumericVector& rdata, std::vector* out) { - Rcpp::RObject dim = rdata.attr("dim"); - Rcpp::Dimension rshape(dim); - out->resize(rdata.size()); - ColToRowMajor(rdata.begin(), Dim2Vec(rshape), - out->size(), dmlc::BeginPtr(*out)); -} - void NDArrayPacker::Push(const NDArray::RObjectType& nd) { NDArray arr(nd); - Rcpp::Dimension rshape = arr.shape(); + Rcpp::Dimension rshape = arr.dim(); if (shape_.size() == 0) { shape_.resize(rshape.size()); for (size_t i = 0; i < shape_.size(); ++i) { shape_[i] = rshape[i]; } } else { - for (size_t i = 1; i < shape_.size(); ++i) { + RCHECK(shape_.size() == rshape.size()) + << "The number of dimension need to be matched"; + for (size_t i = 0; i < shape_.size() - 1; ++i) { RCHECK(shape_[i] == rshape[i]) - << "The dimension besides 0 need to be consistent for arrays pushed"; + << "The dimension besides last need to be consistent for arrays pushed"; } - shape_[0] += rshape[0]; + shape_.back() += rshape[shape_.size() - 1]; } - size_t begin = data_.size(), size = rshape.prod(); + size_t begin = data_.size(); + size_t size = rshape.prod(); data_.resize(begin + size); MX_CALL(MXNDArraySyncCopyToCPU( arr->handle, dmlc::BeginPtr(data_) + begin, size)); @@ -156,8 +151,7 @@ Rcpp::NumericVector NDArrayPacker::Get() const { Rcpp::Dimension dim(sexp); Rcpp::NumericVector ret(dim); RCHECK(ret.size() == data_.size()); - RowToColMajor(dmlc::BeginPtr(data_), shape_, - data_.size(), ret.begin()); + std::copy(data_.begin(), data_.end(), ret.begin()); return ret; } @@ -165,18 +159,19 @@ Rcpp::RObject NDArrayPacker::CreateNDArrayPacker() { return Rcpp::internal::make_new_object(new 
NDArrayPacker()); } -Rcpp::Dimension NDArray::shape() const { +Rcpp::Dimension NDArray::dim() const { mx_uint ndim; const mx_uint *pshape; MX_CALL(MXNDArrayGetShape( ptr_->handle, &ndim, &pshape)); Rcpp::IntegerVector dat(pshape, pshape + ndim); + std::reverse(dat.begin(), dat.end()); Rcpp::RObject ret = dat; return Rcpp::Dimension(ret); } NDArray NDArray::Clone() const { - std::vector shape = Dim2Vec(this->shape()); + std::vector shape = Dim2InternalShape(this->dim()); Context ctx = this->ctx(); NDArrayHandle handle; MX_CALL(MXNDArrayCreate(dmlc::BeginPtr(shape), @@ -194,7 +189,7 @@ Context NDArray::ctx() const { } size_t NDArray::Size() const { - Rcpp::Dimension dim = this->shape(); + Rcpp::Dimension dim = this->dim(); size_t sz = 1; for (size_t i = 0; i < dim.size(); ++i) { sz *= dim[i]; @@ -209,13 +204,12 @@ NDArray NDArray::Slice(mx_uint begin, mx_uint end) const { } Rcpp::NumericVector NDArray::AsNumericVector() const { - Rcpp::Dimension rshape = this->shape(); + Rcpp::Dimension rshape = this->dim(); std::vector temp(rshape.prod()); MX_CALL(MXNDArraySyncCopyToCPU( ptr_->handle, dmlc::BeginPtr(temp), temp.size())); Rcpp::NumericVector ret(rshape); - RowToColMajor(dmlc::BeginPtr(temp), Dim2Vec(rshape), - temp.size(), ret.begin()); + std::copy(temp.begin(), temp.end(), ret.begin()); return ret; } @@ -263,7 +257,7 @@ Rcpp::List NDArray::Load(const std::string& filename) { NDArray::RObjectType NDArray::Empty( const Rcpp::Dimension& rshape, const Context::RObjectType& rctx) { - std::vector shape = Dim2Vec(rshape); + std::vector shape = Dim2InternalShape(rshape); Context ctx(rctx); NDArrayHandle handle; MX_CALL(MXNDArrayCreate(dmlc::BeginPtr(shape), @@ -310,13 +304,12 @@ NDArray::RObjectType NDArray::Array( Rcpp::NumericVector rdata(src); Rcpp::RObject dim = rdata.attr("dim"); Rcpp::Dimension rshape(dim); - std::vector temp(rdata.size()); - ColToRowMajor(rdata.begin(), Dim2Vec(rshape), - temp.size(), dmlc::BeginPtr(temp)); RObjectType ret = NDArray::Empty(rshape, ctx); + std::vector temp(rdata.size()); + std::copy(rdata.begin(), rdata.end(), temp.begin()); MX_CALL(MXNDArraySyncCopyFromCPU( NDArray(ret)->handle, - dmlc::BeginPtr(temp), temp.size())); + dmlc::BeginPtr(temp), rdata.size())); return ret; } @@ -578,15 +571,15 @@ NDArray::RObjectType DispatchOps(SEXP op, SEXP lhs, SEXP rhs) { return NDArray::RObject(out, true); } -Rcpp::Dimension shape(const NDArray::RObjectType& src) { - return NDArray(src).shape(); +Rcpp::Dimension dim(const NDArray::RObjectType& src) { + return NDArray(src).dim(); } Context::RObjectType ctx(const NDArray::RObjectType& src) { return NDArray(src).ctx().RObject(); } -size_t Size(const NDArray::RObjectType& src) { +unsigned long Size(const NDArray::RObjectType& src) { // NOLINT(*) return NDArray(src).Size(); } @@ -596,7 +589,12 @@ Rcpp::NumericVector AsNumericVector(const NDArray::RObjectType& src) { NDArray::RObjectType Slice(const NDArray::RObjectType& src, mx_uint begin, mx_uint end) { - return NDArray(src).Slice(begin, end).RObject(); + NDArray nd(src); + Rcpp::Dimension dim = nd.dim(); + size_t ndim = dim.size(); + RCHECK(dim[ndim - 1] >= end) + << "end=" << end << ", max-dim=" << dim[ndim - 1]; + return nd.Slice(begin, end).RObject(); } } // namespace ndarray @@ -610,7 +608,7 @@ void NDArray::InitRcppModule() { function("mx.nd.internal.empty.array", &NDArray::Empty); function("mx.nd.internal.dispatch.Ops", &ndarray::DispatchOps); // exposing members - function("mx.nd.internal.dim", &ndarray::shape); + function("mx.nd.internal.dim", &ndarray::dim); 
function("mx.nd.internal.ctx", &ndarray::ctx); function("mx.nd.internal.length", &ndarray::Size); function("mx.nd.internal.as.array", &ndarray::AsNumericVector); diff --git a/R-package/src/ndarray.h b/R-package/src/ndarray.h index deb0d27af881..131aae5433ad 100644 --- a/R-package/src/ndarray.h +++ b/R-package/src/ndarray.h @@ -101,9 +101,9 @@ class NDArray { } /*! * \param src The source array. - * \return The shape of the array + * \return The dimension of the array */ - Rcpp::Dimension shape() const; + Rcpp::Dimension dim() const; /*! * \brief Return a clone of NDArray. * Do not expose this to R side. @@ -269,14 +269,8 @@ class NDArrayFunction : public ::Rcpp::CppFunction { }; /*! - * \brief Convert the src into row major layout into out - * \param src The source vector - * \param out The output memory. - */ -void ConvertToRowMajor(const Rcpp::NumericVector& src, std::vector* out); - -/*! - * \brief An array packer that packs NDArray array together on dimension 0. + * \brief An array packer that packs NDArray array together on + * slowest changing dimension. */ class NDArrayPacker { public: diff --git a/R-package/src/symbol.cc b/R-package/src/symbol.cc index 26b5088bbd1b..82cd2cb86696 100644 --- a/R-package/src/symbol.cc +++ b/R-package/src/symbol.cc @@ -120,7 +120,9 @@ inline Rcpp::List BuildShapeData(mx_uint shape_size, const std::vector &names) { Rcpp::List ret(shape_size); for (mx_uint i = 0; i < shape_size; ++i) { - ret[i] = Rcpp::IntegerVector(shape_data[i], shape_data[i] + shape_ndim[i]); + Rcpp::IntegerVector dim(shape_data[i], shape_data[i] + shape_ndim[i]); + std::reverse(dim.begin(), dim.end()); + ret[i] = dim; } ret.names() = names; return ret; @@ -136,7 +138,7 @@ SEXP Symbol::InferShape(const Rcpp::List& kwargs) const { for (size_t i = 0; i < kwargs.size(); ++i) { RCHECK(keys[i].length() != 0) << "Need to pass parameters in key=value style.\n"; - std::vector dim = Dim2Vec(kwargs[i]); + std::vector dim = Dim2InternalShape(kwargs[i]); arg_shape_data.insert(arg_shape_data.end(), dim.begin(), dim.end()); arg_ind_ptr.push_back(static_cast(arg_shape_data.size())); } diff --git a/R-package/vignettes/classifyRealImageWithPretrainedModel.Rmd b/R-package/vignettes/classifyRealImageWithPretrainedModel.Rmd index 8a0e79c739c4..c1f63a164a5e 100644 --- a/R-package/vignettes/classifyRealImageWithPretrainedModel.Rmd +++ b/R-package/vignettes/classifyRealImageWithPretrainedModel.Rmd @@ -77,12 +77,10 @@ preproc.image <-function(im, mean.image) { # convert to array (x, y, channel) arr <- as.array(resized) dim(arr) = c(224, 224, 3) - # Change to the format of mxnet (channel, height, width) - sample <- aperm(arr, c(3, 2, 1)) # substract the mean - normed <- sample - mean.img - # Reshape to format needed by mxnet - dim(normed) <- c(1, 3, 224, 224) + normed <- arr - mean.img + # Reshape to format needed by mxnet (width, height, channel, num) + dim(normed) <- c(224, 224, 3, 1) return(normed) } ``` @@ -106,9 +104,9 @@ dim(prob) As you can see ```prob``` is a 1 times 1000 array, which gives the probability over the 1000 image classes of the input. -We can use the ```max.col``` to get the class index. +We can use the ```max.col``` on the transpose of prob. get the class index. 
```{r} -max.idx <- max.col(prob) +max.idx <- max.col(t(prob)) max.idx ``` diff --git a/R-package/vignettes/fiveMinutesNeuralNetwork.Rmd b/R-package/vignettes/fiveMinutesNeuralNetwork.Rmd index 23c8107c01ee..82ad3cd4515a 100644 --- a/R-package/vignettes/fiveMinutesNeuralNetwork.Rmd +++ b/R-package/vignettes/fiveMinutesNeuralNetwork.Rmd @@ -1,7 +1,7 @@ Neural Network with MXNet in Five Minutes ============================================= -This is the first tutorial for new users of the R package `mxnet`. You will learn to construct a neural network to do regression in 5 minutes. +This is the first tutorial for new users of the R package `mxnet`. You will learn to construct a neural network to do regression in 5 minutes. We will show you how to do classification and regression tasks respectively. The data we use comes from the package `mlbench`. @@ -34,7 +34,7 @@ The next step is to define the structure of the neural network. ```{r} # Define the input data data <- mx.symbol.Variable("data") -# A fully connected hidden layer +# A fully connected hidden layer # data: input source # num_hidden: number of neurons in this hidden layer fc1 <- mx.symbol.FullyConnected(data, num_hidden=20) @@ -69,14 +69,16 @@ model <- mx.model.FeedForward.create(softmax, X=train.x, y=train.y, epoch.end.callback=mx.callback.log.train.metric(100)) ``` -Note that `mx.set.seed` is the correct function to control the random process in `mxnet`. You can see the accuracy in each round during training. It is also easy to make prediction and evaluate +Note that `mx.set.seed` is the correct function to control the random process in `mxnet`. You can see the accuracy in each round during training. It is also easy to make prediction and evaluate. ```{r} preds = predict(model, test.x) -pred.label = max.col(preds)-1 +pred.label = max.col(t(preds))-1 table(pred.label, test.y) ``` +Note that for multi-class prediction, mxnet outputs nclass x nexamples, with each row corresponding to the probability of that class. + ## Regression Again, let us preprocess the data first. @@ -96,7 +98,7 @@ We can configure another network as what we have done above. The main difference ```{r} # Define the input data data <- mx.symbol.Variable("data") -# A fully connected hidden layer +# A fully connected hidden layer # data: input source # num_hidden: number of neurons in this hidden layer fc1 <- mx.symbol.FullyConnected(data, num_hidden=1) diff --git a/R-package/vignettes/mnistCompetition.Rmd b/R-package/vignettes/mnistCompetition.Rmd index ebfbc505907a..1913887426cf 100644 --- a/R-package/vignettes/mnistCompetition.Rmd +++ b/R-package/vignettes/mnistCompetition.Rmd @@ -27,9 +27,10 @@ train.y <- train[,1] Here every image is represented as a single row in train/test. The greyscale of each image falls in the range [0, 255], we can linearly transform it into [0,1] by ```{r} -train.x <- train.x/255 -test <- test/255 +train.x <- t(train.x/255) +test <- t(test/255) ``` +We also transpose the input matrix to npixel x nexamples, which is the column major format accepted by mxnet (and the convention of R). In the label part, we see the number of each digit is fairly even: @@ -59,7 +60,7 @@ softmax <- mx.symbol.Softmax(fc3, name="sm") 6. Here comes the output layer. Since there's only 10 digits, we set the number of neurons to 10. 7. Finally we set the activation to softmax to get a probabilistic prediction. -## Training +## Training We are almost ready for the training process. Before we start the computation, let's decide what device should we use.
@@ -90,14 +91,14 @@ dim(preds) It is a matrix with 28000 rows and 10 cols, containing the desired classification probabilities from the output layer. To extract the maximum label for each row, we can use the `max.col` in R: ```{r} -pred.label <- max.col(preds) - 1 +pred.label <- max.col(t(preds)) - 1 table(pred.label) ``` With a little extra effort in the csv format, we can have our submission to the competition! ```{r} -submission <- data.frame(ImageId=1:nrow(test), Label=pred.label) +submission <- data.frame(ImageId=1:ncol(test), Label=pred.label) write.csv(submission, file='submission.csv', row.names=FALSE, quote=FALSE) ``` @@ -133,12 +134,10 @@ lenet <- mx.symbol.Softmax(data=fc2) Then let us reshape the matrices into arrays: ```{r} -train.array <- t(train.x) -dim(train.array) <- c(1,28,28,nrow(train.x)) -train.array <- aperm(train.array, c(4,1,2,3)) -test.array <- t(test) -dim(test.array) <- c(1,28,28,nrow(test)) -test.array <- aperm(test.array, c(4,1,2,3)) +train.array <- train.x +dim(train.array) <- c(28, 28, 1, ncol(train.x)) +test.array <- test +dim(test.array) <- c(28, 28, 1, ncol(test)) ``` Next we are going to compare the training speed on different devices, so the definition of the devices goes first: @@ -185,8 +184,8 @@ Finally we can submit the result to Kaggle again to see the improvement of our r ```{r} preds <- predict(model, test.array) -pred.label <- max.col(preds) - 1 -submission <- data.frame(ImageId=1:nrow(test), Label=pred.label) +pred.label <- max.col(t(preds)) - 1 +submission <- data.frame(ImageId=1:ncol(test), Label=pred.label) write.csv(submission, file='submission.csv', row.names=FALSE, quote=FALSE) ``` diff --git a/README.md b/README.md index 664dfac89f88..aef489ff7920 100644 --- a/README.md +++ b/README.md @@ -12,16 +12,19 @@ deep learning programs together to maximize the efficiency and your productivity What's New ---------- -* [Note on Programming Models for Deep Learning](http://mxnet.readthedocs.org/en/latest/program_model.html) -* [Pretrained Inception BatchNorm Network](example/notebooks/predict-with-pretrained-model.ipynb) -* [Working with Numpy](example/mnist/mlp_numpy.py) -* [Note on Dependency Engine for Deep Learning](http://mxnet.readthedocs.org/en/latest/developer-guide/note_engine.html) +* [Training Deep Net on 14 Million Images on A Single Machine](https://mxnet-bing.readthedocs.org/en/latest/tutorial/imagenet_full.html) +* [MXNet.jl Julia binding initial release](https://github.com/dmlc/MXNet.jl) +* [Design Note: Squeeze the Memory Consumption of Deep Learning](http://mxnet.readthedocs.org/en/latest/developer-guide/note_memory.html) +* [LSTM Example by using symbolic API](https://github.com/dmlc/mxnet/tree/master/example/rnn) +* [MXNet R Package brings Deep learning for R!](https://github.com/dmlc/mxnet/tree/master/R-package) +* [Design Note: Dependency Engine for Deep Learning](http://mxnet.readthedocs.org/en/latest/developer-guide/note_engine.html) Contents -------- * [Documentation and Tutorials](http://mxnet.readthedocs.org/en/latest/) * [Open Source Design Notes](http://mxnet.readthedocs.org/en/latest/#open-source-design-notes) * [Code Examples](example) +* [Pretrained Models](https://github.com/dmlc/mxnet-model-gallery) * [Installation](http://mxnet.readthedocs.org/en/latest/build.html) * [Features](#features) * [Contribute to MXNet](http://mxnet.readthedocs.org/en/latest/contribute.html) @@ -36,7 +39,7 @@ Features * Auto parallelization - Write numpy-style ndarray GPU programs, which will be automatically parallelized. 
* Language agnostic - - With support for python, c++, more to come. + - With support for python, c++, R, more to come. * Cloud friendly - Directly load/save from S3, HDFS, AZure * Easy extensibility diff --git a/dmlc-core b/dmlc-core index 046a4a77e74d..c30a1a055644 160000 --- a/dmlc-core +++ b/dmlc-core @@ -1 +1 @@ -Subproject commit 046a4a77e74d45e5ac16f2a598c31d56d5ccce3d +Subproject commit c30a1a0556442506b4cfe9a4701c8ca77c6d9a38 diff --git a/doc/R-package/classifyRealImageWithPretrainedModel.md b/doc/R-package/classifyRealImageWithPretrainedModel.md index 16d96f9abbd2..f5c88fed24ba 100644 --- a/doc/R-package/classifyRealImageWithPretrainedModel.md +++ b/doc/R-package/classifyRealImageWithPretrainedModel.md @@ -9,6 +9,12 @@ real world image. The network architecture is decribed in [1]. The pre-trained Inception-BatchNorm network is able to be downloaded from [this link](http://webdocs.cs.ualberta.ca/~bx3/data/Inception.zip) This model gives the recent state-of-art prediction accuracy on image net dataset. +Preface +------- +This tutorial is written in Rmarkdown. +- You can directly view the hosted version of the tutorial from [MXNet R Document](http://mxnet.readthedocs.org/en/latest/R-package/classifyRealImageWithPretrainedModel.html) +- You can download the Rmarkdown source from [here](https://github.com/dmlc/mxnet/blob/master/R-package/vignettes/classifyRealImageWithPretrainedModel.Rmd) + Pacakge Loading --------------- To get started, we load the mxnet package by require mxnet. @@ -115,12 +121,10 @@ preproc.image <-function(im, mean.image) { # convert to array (x, y, channel) arr <- as.array(resized) dim(arr) = c(224, 224, 3) - # Change to the format of mxnet (channel, height, width) - sample <- aperm(arr, c(3, 2, 1)) # substract the mean - normed <- sample - mean.img - # Reshape to format needed by mxnet - dim(normed) <- c(1, 3, 224, 224) + normed <- arr - mean.img + # Reshape to format needed by mxnet (width, height, channel, num) + dim(normed) <- c(224, 224, 3, 1) return(normed) } ``` @@ -144,16 +148,16 @@ dim(prob) ``` ``` -## [1] 1 1000 +## [1] 1000 1 ``` As you can see ```prob``` is a 1 times 1000 array, which gives the probability over the 1000 image classes of the input. -We can use the ```max.col``` to get the class index. +We can use the ```max.col``` on the transpose of prob to get the class index. ```r -max.idx <- max.col(prob) +max.idx <- max.col(t(prob)) max.idx ``` diff --git a/doc/R-package/fiveMinutesNeuralNetwork.md b/doc/R-package/fiveMinutesNeuralNetwork.md index 1d6dd0eca3e8..a58eafa62474 100644 --- a/doc/R-package/fiveMinutesNeuralNetwork.md +++ b/doc/R-package/fiveMinutesNeuralNetwork.md @@ -1,7 +1,7 @@ Neural Network with MXNet in Five Minutes ============================================= -This is the first tutorial for new users of the R package `mxnet`. You will learn to construct a neural network to do regression in 5 minutes. +This is the first tutorial for new users of the R package `mxnet`. You will learn to construct a neural network to do regression in 5 minutes. We will show you how to do classification and regression tasks respectively. The data we use comes from the package `mlbench`. @@ -50,7 +50,7 @@ The next step is to define the structure of the neural network.
```r # Define the input data data <- mx.symbol.Variable("data") -# A fully connected hidden layer +# A fully connected hidden layer # data: input source # num_hidden: number of neurons in this hidden layer fc1 <- mx.symbol.FullyConnected(data, num_hidden=20) @@ -88,6 +88,7 @@ model <- mx.model.FeedForward.create(softmax, X=train.x, y=train.y, ``` ``` +## Auto detect layout of input matrix, use rowmajor.. ## Start training with 1 devices ## [1] Train-accuracy=0.5 ## [2] Train-accuracy=0.514285714285714 @@ -111,12 +112,19 @@ model <- mx.model.FeedForward.create(softmax, X=train.x, y=train.y, ## [20] Train-accuracy=0.857142857142857 ``` -Note that `mx.set.seed` is the correct function to control the random process in `mxnet`. You can see the accuracy in each round during training. It is also easy to make prediction and evaluate +Note that `mx.set.seed` is the correct function to control the random process in `mxnet`. You can see the accuracy in each round during training. It is also easy to make prediction and evaluate. ```r preds = predict(model, test.x) -pred.label = max.col(preds)-1 +``` + +``` +## Auto detect layout of input matrix, use rowmajor.. +``` + +```r +pred.label = max.col(t(preds))-1 table(pred.label, test.y) ``` ``` ## test.y ## pred.label 0 1 ## 0 24 14 ## 1 36 33 ``` +Note that for multi-class prediction, mxnet outputs nclass x nexamples, with each row corresponding to the probability of that class. + ## Regression Again, let us preprocess the data first. @@ -148,7 +158,7 @@ We can configure another network as what we have done above. The main difference ```r # Define the input data data <- mx.symbol.Variable("data") -# A fully connected hidden layer +# A fully connected hidden layer # data: input source # num_hidden: number of neurons in this hidden layer fc1 <- mx.symbol.FullyConnected(data, num_hidden=1) @@ -169,57 +179,58 @@ model <- mx.model.FeedForward.create(lro, X=train.x, y=train.y, ``` ``` +## Auto detect layout of input matrix, use rowmajor..
## Start training with 1 devices -## [1] Train-rmse=16.063282524034 -## [2] Train-rmse=12.2792375712573 -## [3] Train-rmse=11.1984634005885 +## [1] Train-rmse=16.0632825223292 +## [2] Train-rmse=12.2792375527391 +## [3] Train-rmse=11.1984634148088 ## [4] Train-rmse=10.2645236892904 -## [5] Train-rmse=9.49711005504284 -## [6] Train-rmse=9.07733734175182 -## [7] Train-rmse=9.07884450847991 -## [8] Train-rmse=9.10463850277417 -## [9] Train-rmse=9.03977049028532 -## [10] Train-rmse=8.96870685004475 -## [11] Train-rmse=8.93113287361574 -## [12] Train-rmse=8.89937257821847 -## [13] Train-rmse=8.87182096922953 -## [14] Train-rmse=8.84476075083586 -## [15] Train-rmse=8.81464673014974 -## [16] Train-rmse=8.78672567900196 -## [17] Train-rmse=8.76265872846474 -## [18] Train-rmse=8.73946101419974 -## [19] Train-rmse=8.71651926303267 -## [20] Train-rmse=8.69457600919277 -## [21] Train-rmse=8.67354928674563 -## [22] Train-rmse=8.65328755392436 -## [23] Train-rmse=8.63378039680078 -## [24] Train-rmse=8.61488162586984 -## [25] Train-rmse=8.5965105183022 -## [26] Train-rmse=8.57868133563275 -## [27] Train-rmse=8.56135851937663 -## [28] Train-rmse=8.5444819772098 -## [29] Train-rmse=8.52802114610432 -## [30] Train-rmse=8.5119504512622 -## [31] Train-rmse=8.49624261719241 -## [32] Train-rmse=8.48087453238701 -## [33] Train-rmse=8.46582689119887 -## [34] Train-rmse=8.45107881002491 -## [35] Train-rmse=8.43661331401712 -## [36] Train-rmse=8.42241575909639 -## [37] Train-rmse=8.40847217331365 -## [38] Train-rmse=8.39476931796395 -## [39] Train-rmse=8.38129658373974 -## [40] Train-rmse=8.36804269059018 -## [41] Train-rmse=8.35499817678397 -## [42] Train-rmse=8.34215505742154 -## [43] Train-rmse=8.32950441908131 -## [44] Train-rmse=8.31703985777311 -## [45] Train-rmse=8.30475363906755 -## [46] Train-rmse=8.29264031506106 -## [47] Train-rmse=8.28069372820073 -## [48] Train-rmse=8.26890902770415 -## [49] Train-rmse=8.25728089053853 -## [50] Train-rmse=8.24580511500735 +## [5] Train-rmse=9.49711003902655 +## [6] Train-rmse=9.07733735504537 +## [7] Train-rmse=9.07884447337348 +## [8] Train-rmse=9.10463849901276 +## [9] Train-rmse=9.03977048081203 +## [10] Train-rmse=8.96870681959898 +## [11] Train-rmse=8.93113268945833 +## [12] Train-rmse=8.89937250031474 +## [13] Train-rmse=8.87182124831547 +## [14] Train-rmse=8.84476111567396 +## [15] Train-rmse=8.81464687265692 +## [16] Train-rmse=8.78672579209995 +## [17] Train-rmse=8.76265895056591 +## [18] Train-rmse=8.73946101364483 +## [19] Train-rmse=8.7165194446551 +## [20] Train-rmse=8.69457580107095 +## [21] Train-rmse=8.67354933875898 +## [22] Train-rmse=8.65328764760528 +## [23] Train-rmse=8.63378016812285 +## [24] Train-rmse=8.61488175856399 +## [25] Train-rmse=8.59651041652324 +## [26] Train-rmse=8.57868122898644 +## [27] Train-rmse=8.56135865255391 +## [28] Train-rmse=8.54448212525355 +## [29] Train-rmse=8.52802110389574 +## [30] Train-rmse=8.51195043845808 +## [31] Train-rmse=8.49624250344235 +## [32] Train-rmse=8.48087452797975 +## [33] Train-rmse=8.46582681750595 +## [34] Train-rmse=8.45107900842757 +## [35] Train-rmse=8.43661347614512 +## [36] Train-rmse=8.42241598595198 +## [37] Train-rmse=8.40847223745159 +## [38] Train-rmse=8.39476934189048 +## [39] Train-rmse=8.38129658669852 +## [40] Train-rmse=8.36804245552321 +## [41] Train-rmse=8.35499814305568 +## [42] Train-rmse=8.34215500774088 +## [43] Train-rmse=8.3295045517182 +## [44] Train-rmse=8.31703965839842 +## [45] Train-rmse=8.30475372106883 +## [46] Train-rmse=8.2926402584762 +## [47] 
Train-rmse=8.2806936364631 +## [48] Train-rmse=8.26890890119326 +## [49] Train-rmse=8.25728092677924 +## [50] Train-rmse=8.24580513680541 ``` It is also easy to make prediction and evaluate @@ -227,6 +238,13 @@ It is also easy to make prediction and evaluate ```r preds = predict(model, test.x) +``` + +``` +## Auto detect layout of input matrix, use rowmajor.. +``` + +```r sqrt(mean((preds-test.y)^2)) ``` @@ -256,57 +274,58 @@ model <- mx.model.FeedForward.create(lro, X=train.x, y=train.y, ``` ``` +## Auto detect layout of input matrix, use rowmajor.. ## Start training with 1 devices -## [1] Train-mae=13.1889538083225 -## [2] Train-mae=9.81431959337658 -## [3] Train-mae=9.21576419870059 +## [1] Train-mae=13.1889538090676 +## [2] Train-mae=9.81431958410475 +## [3] Train-mae=9.21576420929697 ## [4] Train-mae=8.38071537613869 -## [5] Train-mae=7.45462437611487 -## [6] Train-mae=6.93423301743136 -## [7] Train-mae=6.91432357016537 -## [8] Train-mae=7.02742733055105 -## [9] Train-mae=7.00618194618469 -## [10] Train-mae=6.92541576984028 -## [11] Train-mae=6.87530243690643 -## [12] Train-mae=6.84757369098564 -## [13] Train-mae=6.82966501611388 -## [14] Train-mae=6.81151759574811 -## [15] Train-mae=6.78394182841811 -## [16] Train-mae=6.75914719419347 -## [17] Train-mae=6.74180388773481 -## [18] Train-mae=6.725853071279 -## [19] Train-mae=6.70932178215848 -## [20] Train-mae=6.6928868798746 -## [21] Train-mae=6.6769521329138 -## [22] Train-mae=6.66184809505939 -## [23] Train-mae=6.64754504809777 -## [24] Train-mae=6.63358514060577 -## [25] Train-mae=6.62027640889088 -## [26] Train-mae=6.60738245232238 -## [27] Train-mae=6.59505546771818 -## [28] Train-mae=6.58346195800437 -## [29] Train-mae=6.57285477783945 -## [30] Train-mae=6.56259003960424 -## [31] Train-mae=6.5527790788975 -## [32] Train-mae=6.54353428422991 -## [33] Train-mae=6.5344172368447 -## [34] Train-mae=6.52557652526432 -## [35] Train-mae=6.51697905850079 -## [36] Train-mae=6.50847898812758 -## [37] Train-mae=6.50014844106303 -## [38] Train-mae=6.49207674844397 -## [39] Train-mae=6.48412070125341 -## [40] Train-mae=6.47650500999557 -## [41] Train-mae=6.46893867486053 -## [42] Train-mae=6.46142131653097 -## [43] Train-mae=6.45395035048326 -## [44] Train-mae=6.44652914123403 -## [45] Train-mae=6.43916216409869 -## [46] Train-mae=6.43183777381976 -## [47] Train-mae=6.42455544223388 -## [48] Train-mae=6.41731406417158 -## [49] Train-mae=6.41011292926139 -## [50] Train-mae=6.40312503493494 +## [5] Train-mae=7.45462434962392 +## [6] Train-mae=6.93423304392232 +## [7] Train-mae=6.91432355824444 +## [8] Train-mae=7.02742730538464 +## [9] Train-mae=7.00618193757513 +## [10] Train-mae=6.92541587183045 +## [11] Train-mae=6.87530209053722 +## [12] Train-mae=6.847573687012 +## [13] Train-mae=6.82966502538572 +## [14] Train-mae=6.81151769575146 +## [15] Train-mae=6.78394197610517 +## [16] Train-mae=6.75914737499422 +## [17] Train-mae=6.74180429437094 +## [18] Train-mae=6.72585320373376 +## [19] Train-mae=6.70932160268227 +## [20] Train-mae=6.69288677523534 +## [21] Train-mae=6.67695207827621 +## [22] Train-mae=6.66184799075127 +## [23] Train-mae=6.64754500372542 +## [24] Train-mae=6.63358518299129 +## [25] Train-mae=6.62027624067333 +## [26] Train-mae=6.60738218476375 +## [27] Train-mae=6.59505565381712 +## [28] Train-mae=6.58346203284131 +## [29] Train-mae=6.57285475134849 +## [30] Train-mae=6.56259016940991 +## [31] Train-mae=6.55277890273266 +## [32] Train-mae=6.54353418886248 +## [33] Train-mae=6.53441721167829 +## [34] Train-mae=6.52557678090202 
+## [35] Train-mae=6.51697915651732 +## [36] Train-mae=6.50847910601232 +## [37] Train-mae=6.50014858543873 +## [38] Train-mae=6.49207666102383 +## [39] Train-mae=6.48412067078882 +## [40] Train-mae=6.47650481263797 +## [41] Train-mae=6.46893873314063 +## [42] Train-mae=6.46142139865292 +## [43] Train-mae=6.45395037829876 +## [44] Train-mae=6.44652904189295 +## [45] Train-mae=6.43916221575605 +## [46] Train-mae=6.43183771024148 +## [47] Train-mae=6.42455528063907 +## [48] Train-mae=6.41731397675143 +## [49] Train-mae=6.41011299813787 +## [50] Train-mae=6.40312501904037 ``` Congratulations! Now you have learnt the basic for using `mxnet`. diff --git a/doc/R-package/index.md b/doc/R-package/index.md index 68bc97aed699..744fcd6247f8 100644 --- a/doc/R-package/index.md +++ b/doc/R-package/index.md @@ -11,12 +11,7 @@ Sounds exciting? This page contains links to all the related documents on R pack Get Started ----------- -There are several information to get you started -* [Installation Guide](../build.md) contains instructions to install mxnet. -* [Tutorials](#tutorials) contains various examples how how mxnet can be applied to different cool tasks :) -* [Contributor Guide](http://mxnet.readthedocs.org/en/latest/contribute.html#r-package) - - The R package section gives various guidelines on how to contribute code, tutorial, rmarkdown examples to mxnet. - - Your contribution is always welcomed! +Checkout the [Installation Guide](../build.md) contains instructions to install mxnet, and [Tutorials](#tutorials) for examples on how to use mxnet for various tasks. Tutorials --------- @@ -25,3 +20,10 @@ Tutorials * [Handwritten Digits Classification Competition](mnistCompetition.md) * [Tutorial on NDArray and Symbol](ndarrayAndSymbolTutorial.md) +Resources +--------- +There are several information to get you started +* [Installation Guide](../build.md) contains instructions to install mxnet. +* [Contributor Guide](http://mxnet.readthedocs.org/en/latest/contribute.html#r-package) + - The R package section gives various guidelines on how to contribute code, tutorial, rmarkdown examples to mxnet. + - Your contribution is always welcomed! diff --git a/doc/R-package/mnistCompetition.md b/doc/R-package/mnistCompetition.md index 0e73f7700486..16a7ca761146 100644 --- a/doc/R-package/mnistCompetition.md +++ b/doc/R-package/mnistCompetition.md @@ -1,12 +1,12 @@ Handwritten Digits Classification Competition -====================================================== +============================================= -[MNIST](http://yann.lecun.com/exdb/mnist/) is a handwritten digits image data set created by Yann LeCun. Every digit is represented by a 28x28 image. It has become a standard data set to test classifiers on simple image input. Neural network is no doubt a strong model for image classification tasks. There's a [long-term hosted competition](https://www.kaggle.com/c/digit-recognizer) on Kaggle using this data set. We will present the basic usage of [mxnet](https://github.com/dmlc/mxnet/tree/master/R-package) to compete in this challenge. +[MNIST](http://yann.lecun.com/exdb/mnist/) is a handwritten digits image data set created by Yann LeCun. Every digit is represented by a 28x28 image. It has become a standard data set to test classifiers on simple image input. Neural network is no doubt a strong model for image classification tasks. There's a [long-term hosted competition](https://www.kaggle.com/c/digit-recognizer) on Kaggle using this data set. 
+We will present the basic usage of [mxnet](https://github.com/dmlc/mxnet/tree/master/R-package) to compete in this challenge. This tutorial is written in Rmarkdown. You can download the source [here](https://github.com/dmlc/mxnet/blob/master/R-package/vignettes/mnistCompetition.Rmd) and view a hosted version of tutorial [here](http://mxnet.readthedocs.org/en/latest/R-package/mnistCompetition.html). - ## Data Loading First, let us download the data from [here](https://www.kaggle.com/c/digit-recognizer/data), and put them under the `data/` folder in your working directory. @@ -37,9 +37,10 @@ Here every image is represented as a single row in train/test. The greyscale of ```r -train.x <- train.x/255 -test <- test/255 +train.x <- t(train.x/255) +test <- t(test/255) ``` +We also transpose the input matrix to npixel x nexamples, which is the column major format accepted by mxnet (and the convention of R). In the label part, we see the number of each digit is fairly even: @@ -77,7 +78,7 @@ softmax <- mx.symbol.Softmax(fc3, name="sm") 6. Here comes the output layer. Since there's only 10 digits, we set the number of neurons to 10. 7. Finally we set the activation to softmax to get a probabilistic prediction. -## Training +## Training We are almost ready for the training process. Before we start the computation, let's decide what device should we use. @@ -163,14 +164,14 @@ dim(preds) ``` ``` -## [1] 28000 10 +## [1] 10 28000 ``` It is a matrix with 28000 rows and 10 cols, containing the desired classification probabilities from the output layer. To extract the maximum label for each row, we can use the `max.col` in R: ```r -pred.label <- max.col(preds) - 1 +pred.label <- max.col(t(preds)) - 1 table(pred.label) ``` @@ -184,7 +185,7 @@ With a little extra effort in the csv format, we can have our submission to the ```r -submission <- data.frame(ImageId=1:nrow(test), Label=pred.label) +submission <- data.frame(ImageId=1:ncol(test), Label=pred.label) write.csv(submission, file='submission.csv', row.names=FALSE, quote=FALSE) ``` @@ -222,12 +223,10 @@ Then let us reshape the matrices into arrays: ```r -train.array <- t(train.x) -dim(train.array) <- c(1,28,28,nrow(train.x)) -train.array <- aperm(train.array, c(4,1,2,3)) -test.array <- t(test) -dim(test.array) <- c(1,28,28,nrow(test)) -test.array <- aperm(test.array, c(4,1,2,3)) +train.array <- train.x +dim(train.array) <- c(28, 28, 1, ncol(train.x)) +test.array <- test +dim(test.array) <- c(28, 28, 1, ncol(test)) ``` Next we are going to compare the training speed on different devices, so the definition of the devices goes first: @@ -259,11 +258,11 @@ model <- mx.model.FeedForward.create(lenet, X=train.array, y=train.y, ``` ## Start training with 1 devices -## Batch [100] Train-accuracy=0.1054 -## Batch [200] Train-accuracy=0.1237 -## Batch [300] Train-accuracy=0.352766666666667 -## Batch [400] Train-accuracy=0.498824999999999 -## [1] Train-accuracy=0.519546539379474 +## Batch [100] Train-accuracy=0.1066 +## Batch [200] Train-accuracy=0.16495 +## Batch [300] Train-accuracy=0.401766666666667 +## Batch [400] Train-accuracy=0.537675 +## [1] Train-accuracy=0.557136038186157 ``` ```r @@ -272,7 +271,7 @@ print(proc.time() - tic) ``` ## user system elapsed -## 132.340 203.621 84.825 +## 130.030 204.976 83.821 ``` Training on GPU: @@ -290,31 +289,31 @@ model <- mx.model.FeedForward.create(lenet, X=train.array, y=train.y, ``` ## Start training with 1 devices -## Batch [100] Train-accuracy=0.1055 -## Batch [200] Train-accuracy=0.1197 -## Batch [300] 
Train-accuracy=0.346266666666667 -## Batch [400] Train-accuracy=0.4925 -## [1] Train-accuracy=0.513699284009546 -## Batch [100] Train-accuracy=0.9577 -## Batch [200] Train-accuracy=0.961849999999999 -## Batch [300] Train-accuracy=0.966 -## Batch [400] Train-accuracy=0.968750000000003 -## [2] Train-accuracy=0.969404761904765 -## Batch [100] Train-accuracy=0.977399999999999 -## Batch [200] Train-accuracy=0.97815 -## Batch [300] Train-accuracy=0.980033333333335 -## Batch [400] Train-accuracy=0.981400000000003 -## [3] Train-accuracy=0.981761904761908 -## Batch [100] Train-accuracy=0.985799999999999 -## Batch [200] Train-accuracy=0.98575 -## Batch [300] Train-accuracy=0.986666666666668 -## Batch [400] Train-accuracy=0.987550000000003 -## [4] Train-accuracy=0.987880952380955 -## Batch [100] Train-accuracy=0.9918 -## Batch [200] Train-accuracy=0.9908 -## Batch [300] Train-accuracy=0.991566666666668 -## Batch [400] Train-accuracy=0.992175000000002 -## [5] Train-accuracy=0.992380952380955 +## Batch [100] Train-accuracy=0.1066 +## Batch [200] Train-accuracy=0.1596 +## Batch [300] Train-accuracy=0.3983 +## Batch [400] Train-accuracy=0.533975 +## [1] Train-accuracy=0.553532219570405 +## Batch [100] Train-accuracy=0.958 +## Batch [200] Train-accuracy=0.96155 +## Batch [300] Train-accuracy=0.966100000000001 +## Batch [400] Train-accuracy=0.968550000000003 +## [2] Train-accuracy=0.969071428571432 +## Batch [100] Train-accuracy=0.977 +## Batch [200] Train-accuracy=0.97715 +## Batch [300] Train-accuracy=0.979566666666668 +## Batch [400] Train-accuracy=0.980900000000003 +## [3] Train-accuracy=0.981309523809527 +## Batch [100] Train-accuracy=0.9853 +## Batch [200] Train-accuracy=0.985899999999999 +## Batch [300] Train-accuracy=0.986966666666668 +## Batch [400] Train-accuracy=0.988150000000002 +## [4] Train-accuracy=0.988452380952384 +## Batch [100] Train-accuracy=0.990199999999999 +## Batch [200] Train-accuracy=0.98995 +## Batch [300] Train-accuracy=0.990600000000001 +## Batch [400] Train-accuracy=0.991325000000002 +## [5] Train-accuracy=0.991523809523812 ``` ```r @@ -323,7 +322,7 @@ print(proc.time() - tic) ``` ## user system elapsed -## 10.176 1.608 7.743 +## 9.288 1.680 6.889 ``` As you can see by using GPU, we can get a much faster speedup in training! @@ -332,8 +331,8 @@ Finally we can submit the result to Kaggle again to see the improvement of our r ```r preds <- predict(model, test.array) -pred.label <- max.col(preds) - 1 -submission <- data.frame(ImageId=1:nrow(test), Label=pred.label) +pred.label <- max.col(t(preds)) - 1 +submission <- data.frame(ImageId=1:ncol(test), Label=pred.label) write.csv(submission, file='submission.csv', row.names=FALSE, quote=FALSE) ``` diff --git a/doc/build.md b/doc/build.md index db8b4c585504..b3b354d19f49 100644 --- a/doc/build.md +++ b/doc/build.md @@ -7,20 +7,27 @@ if you have ideas to improve this page, please send a pull request! Contents -------- -- [Build MXNet Library](#build-mxnet-library) - - Introduces how to build the mxnet core library for all packages. 
- - Supported platforms: linux, windows, osx +- [Building MXNet Library](#build-mxnet-library) + - [Prerequisites](#prerequisites) + - [Building on Linux](#building-on-linux) + - [Building on OSX](#building-on-osx) + - [Building on Windows](#building-on-windows) + - [Installing pre-built packages on Windows](#installing-pre-built-packages-on-windows) - [Advanced Build Configurations](#advanced-build-configuration) - Introduces how to build mxnet with advanced features such as HDFS/S3 support, CUDNN - [Python Package Installation](#python-package-installation) - [R Package Installation](#r-package-installation) +- [Docker Images](#docker-images) Build MXNet Library ------------------- + +### Prerequisites + MXNet have a general runtime library that can be used by various packages such as python, R and Julia. This section gives details about how to build the mxnet library. - On Linux/OSX the target library will be ```libmxnet.so``` -- On Windows the target libary is ```mxnet.dll``` +- On Windows the target libary is ```libmxnet.dll``` Things to do before get started: @@ -36,7 +43,7 @@ The system dependency requirement for mxnet libraries are - BLAS library. - opencv (optional if you do not need image augmentation, you can switch it off in config.mk) -### Linux +### Building on Linux On Ubuntu >= 13.10, one can install the dependencies by @@ -51,7 +58,7 @@ make -j4 ``` Then proceed to package installation instructions for python or R in this page. -### OSX +### Buillding on OSX On OSX, we can install the dependencies by ```bash @@ -73,7 +80,7 @@ make -j4 Then proceed to package installation instructions for python or R in this page. -### Windows +### Building on Windows Firstly, we should make your Visual Studio 2013 support more C++11 features. @@ -88,6 +95,14 @@ Finally, use CMake to create a Visual Studio solution in `./build/`. During conf Then proceed to package installation instructions for python or R in this page. +### Installing pre-built packages on Windows + +Mxnet also provides pre-built packages on Windows. The pre-built package includes pre-build MxNet library, the dependent thrid-party libraries, a sample C++ solution in Visual Studio and the Python install script. + +You can download the packages from the [Releases tab](https://github.com/dmlc/mxnet/releases) of MxNet. There are two variants provided: one with GPU support (using CUDA and CUDNN v3) and one without GPU support. You can choose one that fits your hardward configuration. + +After download, unpack the package into a folder, say D:\MxNet, then install the package by double clicking the setupenv.cmd inside the folder. It will setup environmental variables needed by MxNet. After that, you should be able to usee the provided VS solution to build C++ programs, or to [install Python package](#python-package-installation). + Advanced Build Configurations ----------------------------- The configuration of mxnet can be modified by ```config.mk``` @@ -120,15 +135,30 @@ cd python; python setup.py develop --user R Package Installation ---------------------- -To install the python package. First finish the [Build MXNet Library](#build-mxnet-library) step. +To install the R package. First finish the [Build MXNet Library](#build-mxnet-library) step. Then use the following command to install mxnet at root folder ```bash -R CMD INSTALL R-Package +R CMD INSTALL R-package ``` Hopefully, we will now have mxnet on R! 
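Once either package is installed, a quick smoke test confirms that the freshly built library loads correctly. Below is a minimal Python check, assuming the Python package was installed as described above; R users can do the equivalent by loading the package with `library(mxnet)`.

```python
# Minimal smoke test for a fresh build: if this runs, libmxnet was found and
# basic NDArray execution works. Assumes the Python package was installed as
# described in the Python Package Installation section above.
import mxnet as mx

a = mx.nd.ones((2, 3))        # 2x3 array of ones on the CPU
b = (a * 2 + 1).asnumpy()     # computation happens when we copy to numpy
print(b)                      # expect every entry to be 3.0
```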
## Note on Library Build We isolate the library build with Rcpp end to maximize the portability - - MSVC is needed on windows to build the mxnet library, because of CUDA compatiblity issue of toolchains. \ No newline at end of file + - MSVC is needed on windows to build the mxnet library, because of CUDA compatiblity issue of toolchains. + +Docker Images +------------- +Builds of MXNet are available as [Docker](https://www.docker.com/whatisdocker) images: +[MXNet Docker (CPU)](https://hub.docker.com/r/kaixhin/mxnet/) or [MXNet Docker (CUDA)](https://hub.docker.com/r/kaixhin/cuda-mxnet/). +These are updated on a weekly basis with the latest builds of MXNet. Examples of running bash in a Docker container +are as follows: + +```bash +sudo docker run -it kaixhin/mxnet +sudo docker run -it --device /dev/nvidiactl --device /dev/nvidia-uvm --device /dev/nvidia0 kaixhin/cuda-mxnet:7.0 +``` + +For a guide to Docker, see the [official docs](https://docs.docker.com/userguide/). For more details on how to use the +MXNet Docker images, including requirements for CUDA support, consult the [source project](https://github.com/Kaixhin/dockerfiles). diff --git a/doc/developer-guide/index.md b/doc/developer-guide/index.md index f54e4fd04a99..9f8bf6938ea2 100644 --- a/doc/developer-guide/index.md +++ b/doc/developer-guide/index.md @@ -57,7 +57,9 @@ Open Source Design Notes * [Dependency Engine for Deep Learning](note_engine.md) - Introduces the dependency tracking and scheduling component for general deep learning, this motivates the design of Engine module. - +* [Squeeze the Memory Consumption of Deep Learning](note_memory.md) + - Introduces how we can reduce memory consumption of deep nets + List of Other Resources ----------------------- * [Doxygen Version of C++ API](https://mxnet.readthedocs.org/en/latest/doxygen) gives a comprehensive document of C++ API. diff --git a/doc/developer-guide/multi_node.md b/doc/developer-guide/multi_node.md index 3f43636b41dd..14772251580e 100644 --- a/doc/developer-guide/multi_node.md +++ b/doc/developer-guide/multi_node.md @@ -4,7 +4,7 @@ MXNet uses a two-level *parameter server* for data synchronization. - + - On the first layer, data are synchronized over multiple devices within a single worker machine. A device could be a GPU card, CPU, or other computational diff --git a/doc/developer-guide/note_engine.md b/doc/developer-guide/note_engine.md index a71949886f1d..5c8dedd73da1 100644 --- a/doc/developer-guide/note_engine.md +++ b/doc/developer-guide/note_engine.md @@ -35,7 +35,7 @@ However, it is quite hard to code the sequence manually, as the last operation, ```D = B * C```, needs to wait for both the above operations to complete before it starts running. We can represent the computation as the following dependency graph. -![Dep Simple](https://raw.githubusercontent.com/dmlc/dmlc.github.io/master/img/mxnet/engine/dep_simple.png) +![Dep Simple](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/engine/dep_simple.png) In this specific case, the graph is also called data-flow graph, as it represents the dependency in terms of data and computation. @@ -56,7 +56,7 @@ learning libraries when things go parallel. ### Data Flow Dependency The central thing that almost every dependency engine will have to solve, is the dataflow dependency problem. 
-![Dep Simple](https://raw.githubusercontent.com/dmlc/dmlc.github.io/master/img/mxnet/engine/dep_simple.png) +![Dep Simple](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/engine/dep_simple.png) Data Flow dependency describes how the outcome of one computation can be used in other computations. As we have elaborated this in last section, we will only put the same figure here. Libraries that have @@ -68,7 +68,7 @@ This is simple in the serial case. Because we can simply recycle the memory afte go out of scope. However, things becomes a bit harder in parallel case. Consider the following example -![Dep Del](https://raw.githubusercontent.com/dmlc/dmlc.github.io/master/img/mxnet/engine/dep_del.png) +![Dep Del](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/engine/dep_del.png) In the above example, because both computation needs to use values from A. We cannot perform the memory recycling before these computation completes. So a correct engine @@ -80,7 +80,7 @@ is executed after both ```B = A + 1``` and ```C = A + 2``` completes. Random number generators are commonly used in machine learning. However, they also bring interesting challenges for dependency engine. Consider the following example -![Dep Rand](https://raw.githubusercontent.com/dmlc/dmlc.github.io/master/img/mxnet/engine/dep_rand.png) +![Dep Rand](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/engine/dep_rand.png) Here we are generating random numbers in a sequence. While it seems that the two random number generations can be parallelized. This is usually not the case. Because usually a pseudorandom @@ -131,7 +131,7 @@ a simple SGD update, and copies the updated weight back to each GPU. This is a common data parallel program written in a serial manner. The following dependency graph shows how it can be parallelized: -![Dep Net](https://raw.githubusercontent.com/dmlc/dmlc.github.io/master/img/mxnet/engine/dep_net.png) +![Dep Net](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/engine/dep_net.png) Few important notes: - The copy of gradient to CPU, can happen as soon as we get gradient of that layer. @@ -202,14 +202,14 @@ Because we cannot assume the object we are scheduling on. What we can do instead ```virtual tag``` that is associated with each object to represent what we need to schedule. So at the beginning, user can allocate the variable tag, and attach it to each of object that we want to schedule. -![Dep Net](https://raw.githubusercontent.com/dmlc/dmlc.github.io/master/img/mxnet/engine/tag_var.png) +![Dep Net](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/engine/tag_var.png) After having the variable tags, user call ```push``` to tell the engine about the function we want to execute. In addition, user need to specify the dependencies of the operation by ```read_vars``` and ```write_vars```. - ```read_vars``` are variable tags of objects which the operation will "read from", without changing its internal state. - ```mutate_vars``` are variable tags of objects which the operation will mutate their internal states. -![Push Op](https://raw.githubusercontent.com/dmlc/dmlc.github.io/master/img/mxnet/engine/push_var.png) +![Push Op](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/engine/push_var.png) The above figure shows how we can push operation ```B = A + 1``` to dependency engine. Here ```B.data```, ```A.data``` are the real allocated space. We should note that engine is ***only aware of variable tags***. 
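To make the push interface more concrete, the toy Python sketch below mimics the tag idea. It is illustrative only: `ToyEngine`, `new_variable`, and `push` are names invented for this note rather than the real MXNet engine API, and the toy version simply runs operations serially instead of scheduling them.

```python
# Illustrative-only sketch of a tag-based push interface (not MXNet's real API).
# The engine never touches the underlying memory; it only sees opaque tags and
# the declared read/mutate sets of each pushed operation.
class ToyEngine(object):
    def __init__(self):
        self._next_tag = 0

    def new_variable(self):
        """Allocate an opaque tag standing for one object we want to schedule."""
        tag = self._next_tag
        self._next_tag += 1
        return tag

    def push(self, fn, read_vars, mutate_vars):
        # A real engine would delay fn until every earlier operation touching
        # read_vars / mutate_vars has finished; this toy runs serially, so the
        # dependency order is trivially respected.
        fn()

engine = ToyEngine()
A = {"data": 1.0}
B = {"data": None}
A_var = engine.new_variable()
B_var = engine.new_variable()

# Push "B = A + 1": it reads A's tag and mutates B's tag.
engine.push(lambda: B.update(data=A["data"] + 1.0),
            read_vars=[A_var], mutate_vars=[B_var])
print(B["data"])  # 2.0
```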
@@ -227,16 +227,16 @@ The first line reads variable `A` and mutates variable `B`. The second line read The engine is going to maintain a queue for each variable, as the following animation shows for each of the four lines. Green blocks represents a read action, while a red one represents a mutation. -![Dependency Queue](https://raw.githubusercontent.com/dmlc/dmlc.github.io/master/img/mxnet/engine/dep_queue.gif) +![Dependency Queue](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/engine/dep_queue.gif) Upon building this queue, the engine sees that the first two green blocks at the front of A's queue, could actually be run in parallel, because they are both read actions and won't conflict with each other. The following graph illustrates this point. -![Dependency Parallelism](https://raw.githubusercontent.com/dmlc/dmlc.github.io/master/img/mxnet/engine/dep_parallel.png) +![Dependency Parallelism](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/engine/dep_parallel.png) The cool thing about all this scheduling is, it is not confined to numerical calculations. Since everything scheduled is only a tag, the engine could schedule everything! The following figure gives a complete push sequence of the programs we mentioned in previous sections. -![Push Seq](https://raw.githubusercontent.com/dmlc/dmlc.github.io/master/img/mxnet/engine/push_seq.png) +![Push Seq](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/engine/push_seq.png) ### Port Existing Codes to the Dependency Engine Because the generic interface do not take control of things like memory allocation and what operation to execute. @@ -259,11 +259,11 @@ The general idea is as follows The following figure gives a visual example of the scheduling algorithm, which might give you a better sense of what is going on in the engine. -![Dep Tracking](https://raw.githubusercontent.com/dmlc/dmlc.github.io/master/img/mxnet/engine/engine_queue_step.png) +![Dep Tracking](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/engine/engine_queue_step.png) The following figure gives another example that involves random number generations. -![Dep Rand](https://raw.githubusercontent.com/dmlc/dmlc.github.io/master/img/mxnet/engine/engine_queue_rand.png) +![Dep Rand](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/engine/engine_queue_rand.png) As we can see, the algorithm is mainly about update pending queues of operations and doing the right state transition when operation completed. More care should be taken to make sure the state transition diff --git a/doc/developer-guide/note_memory.md b/doc/developer-guide/note_memory.md new file mode 100644 index 000000000000..43a3299d0618 --- /dev/null +++ b/doc/developer-guide/note_memory.md @@ -0,0 +1,235 @@ +Squeeze the Memory Consumption of Deep Learning +=============================================== +One important theme about deep learning is to train deeper and larger nets. +While the hardware has been upgraded rapidly in recent years, the huge deepnet monsters are +always hungry about the GPU RAMS. Being able to use less memory for the same net also means we can +use larger batch size, and usually higher GPU utilization rate. + +This article discusses how memory allocation optimization can be done for deep neural nets, and provide +some of candidate solutions to the problems. The solutions discussed in this article is by no means complete, +but rather as example that we think is useful to most cases. 
+ +Computation Graph +----------------- +We will start the discussion by introducing computation graph, since this is the tool that will help us in the later +part of the section. Computation graph describes the (data-flow) dependencies between the operations in the deep nets. +The operation performed in the graph can either be fine-grained or coarse grained. +The following figure gives two examples of computation graph. + +![Comp Graph Example](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/memory/comp_graph_example.png) + +The idea of computation graph is deeply rooted in the packages such as Theano, CGT. Actually they also exists implicitly +in most libraries as the network configuration. The major difference in these library comes to how do they calculate gradient. +There are mainly two ways, doing back-propagation on the same graph, or have an explicit backward path that calculates +the gradient needed. + +![Backward Graph](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/memory/back_graph.png) + +Libraries like caffe, cxxnet, torch uses the backprop on same graph. While libraries like Theano, CGT takes the explicit +backward path approach. We will adopt the ***explicit backward path*** way in the article, because it brings several advantages +in turns of optimization. + +However, we should emphasize that choosing the explicit backward path way for execution will not restrict us +to scope of symbolic libraries such as Theano, CGT. We can also use the explicit backward path for gradient calculation of +layer-based(which ties forward, backward together) libraries. The following graph shows how this can be done. +Basically, we can introduce a backward node that links to the forward node of the graph, and calls the ```layer.backward``` +in the backward operations. + +![Backward Layer](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/memory/explicit_back_layer.png) + +So this discussion applies to almost all deep learning libraries that exists +(There are differences between these libraries, e.g. high order differentiation. which are beyond the scope of in this article). + +Why explicit backward path is better? Let us explain it with two examples. The first reason is that the explicit backward path +clearly describes the dependency between the computation. Consider the following case, where we want to get +the gradient of A and B. As we can see clearly from the graph, that computation of ```d(C)``` gradient do not depend on F. +This means we can free the memory of ```F``` right after the the forward computation is done, similarly the memory +of ```C``` can be recycled. + +![Backward Prune](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/memory/back_dep_prune.png) + +Another advantage of explicit backward path is to be able to have a different backward path rather than an mirror of forward one. +One common example is the split connection case, as shown in the following figure. + +![Backward Agg](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/memory/back_agg_grad.png) + +In this example, the output of B is referenced by two operations. If we want to do the gradient calculation in the same +network, an explicit split layer need to be introduced. This means we need to do the split for the forward pass as well. +In this figure, the forward pass do not contain a split layer, but the graph will automatically insert a gradient +aggregation node before passing gradient back to B. 
This helps us to save the memory cost of allocating output of +split layer, as well as operation cost of replicate the data in forward pass. + +If we adopt the explicit backward view of computation graph, there is no difference between the forward pass +and backward pass. We will simply go forward in topological order of the computation graph, and carry out computations. +This also simplifies our discussions. The problem now becomes: + +- How to allocate the memory of each output node of a computation graph? + +Hmm, seems it has nothing to do with deep learning, but more of context of compiling, data flow optimization etc. +But it is really the hungry monster of deep learning that motivates us attack this problem, and benefit from it. + +What can be Optimized +--------------------- +Hopefully you are convinced that the computation graph is a good way to discuss memory allocation optimization techniques. +As you can see some memory saving can already been bought by using explicit backward graph. Let us discuss more about +what optimization we can do, and what is the baseline. + +Asumme we want to build a neural net with ```n``` layers. A typical implementation of neural net will +need to allocate node space for output of each layer, as well as gradient values for back-propagation. +This means we need roughly ```2 n``` memory cells. This is the same in the explicit backward graph case, as +the number of nodes in backward pass in roughly the same as forward pass. + +### Inplace Operations +One of the very first thing that we can do is inplace memory sharing of operations. This is usually done for +simple operations such as activation functions. Consider the following case, where we want to +compute the value of three chained sigmoid function. + +![Inplace op](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/memory/alloc_inline.png) + +Because we can compute sigmoid in the ```inplace``` manner, that is, use the same memory for input and output. +We can simply allocate one copy of memory, and use it compute arbitrary length of sigmoid chain. + +However, the inplace optimization sometimes can be done in the wrong way, especially when the package tries +to be a bit general. Consider the following case, where the value of B is not only used by C, but also F. + +![Inplace trap](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/memory/alloc_inline_trap.png) + +We cannot perform inplace optimization because the value of B is still needed after ```C=sigmoid(B)``` is computed. +So an algorithm that simply do inplace optimization for every sigmoid operation might fall into such trap, +and we need to be careful on when we can do it. + +### Normal Memory Sharing +Memories can also be shared besides the inplace operation. Consider the following case, because the +value of B is no longer needed when we compute E, we can reuse the memory to hold the result of E. + +![Normal Sharing](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/memory/alloc_normal.png) + +We would like to point out that is ***memory sharing does not necessarily require same data shape****. +In the above example, the shape of ```B``` and ```E``` can be different, and we can simply allocate a +memory region that is the maximum of the two sizes and share it between the two. + +### Real Neural Net Allocation Example +The above examples are all make up cases, that only contains the computation of the forward pass. +Actually the idea holds the same for the real neural net cases. 
The following figure shows an allocation +plan we can do for a two layer perception. + +![Net Alloc](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/memory/alloc_mlp.png) + +In the above example: +- Inplace optimization is applied on computing ```act1```, ```d(fc1)```, ```out``` and ```d(fc2)```. +- The memory sharing is used between ```d(act1)``` and ```d(A)```. + +Memory Allocation Algorithm +--------------------------- +We have discussed how the general techniques to optimize memory allocations in previous section. +However, we also see that there are traps which we want to avoid like the inplace case. +How can we allocate the memory correctly? This is not a new problem. For example, it is very similar +to register allocation in compilers. So there could be a lot we can borrow. We do not attempt to give +a comprehensive review of techniques here, but rather introduce some simple but useful trick to attack +the problem. + +The key problem is we want to place resources, such that they do not conflict each other. +More specifically, each variable have a ```life time``` between the time it get computed till the last time it get used. +In the multilayer perception case, the ```life time``` of ```fc1``` ends after ```act1``` get computed. + +![Net Alloc](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/memory/alloc_mlp.png) + +The principle is ***to only allow memory sharing between the variables whose lifetime do not overlap***. There are multiple +ways to solve this problem. One possible way is to construct the conflicting graph of with each variable as node and link edge +between variables with overlapping lifespan, and run a graph-coloring algorithm. This will likely require ```$O(n^2)$``` +complexity where ```n``` is number of nodes in the graph, which could be an reasonable price to pay. + +We will introduce another simple heuristic here. The idea is to simulate the procedure of traversing the graph, +and keep a counter of future operations that depends on the node. + +![Alloc](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/memory/alloc_step.png) + +- An inplace optimization can be performed when only current operation depend on the source(i.e. counter=1) +- A memory can be recycled into the box on the upper right corner when counter goes to 0 +- Every time, when we need new memory, we can either get it from the box, or allocate a new one. + +One note is that during the simulation, no memory is allocated, but we rather keep record of how much memory each node need, +and allocate the maximum of the shared parts in the final memory plan. + +### Static vs Dynamic Allocation + +If you think carefully, you will find the above strategy exactly simulates the dynamic memory allocation procedure in imperative +languages such as python. The counter is the reference counter of each memory object, and the object get garbage collected when +the reference counter goes to zero. In that sense, we are simulating the dynamic memory allocation once to create a static allocation plan. +Now the question is, can we simply use an imperative language that dynamically allocates and de-allocates memories? + +The major difference is that the static allocation is only done once, so we can afford to use more complicated algorithms +- For example, do searching over memories sizes that are similar to the require memory block. +- The allocation can also be made graph aware, see more discussion in next section. 
+- The dynamic way will put more pressure on the memory allocator and garbage collector, which both need to be fast.
+
+There is also one takeaway for users who want to rely on dynamic memory allocation:
+***do not take unnecessary references to objects***. For example, if we organize all the nodes in
+a list and store them in a Net object, these nodes will never be de-referenced, and we gain no space.
+Unfortunately, this is a common way to organize code.
+
+
+Allocation for Parallel Operations
+----------------------------------
+In the previous section, we discussed how we can ```simulate``` the running procedure of a computation graph
+and get a static allocation plan. However, more problems arise when we want to optimize for parallel computation,
+because resource sharing and parallelization sit at the two ends of a balance.
+Let us look at the following two allocation plans for the same graph:
+
+![Parallel Alloc](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/memory/parallel_alloc.png)
+
+Both allocation plans are valid if we run the computation in a serial manner from ```A[1]``` to ```A[8]```.
+However, the allocation plan on the left side introduces extra dependencies, which means we cannot
+run the computation of ```A[2]``` and ```A[5]``` in parallel, while the right one can.
+
+As we can see, if we want to parallelize the computation, more care needs to be taken when planning the allocation.
+
+### Stay Safe and Correct First
+Staying correct is the very first principle to follow. This means executing in a way that takes the implicit dependencies
+introduced by memory sharing into consideration. This can be done by adding the implicit dependency edges to the execution graph.
+Or, even simpler, if the execution engine is mutation aware as described in the
+[dependency engine note](http://mxnet.readthedocs.org/en/latest/developer-guide/note_engine.html), push the operations
+in sequence and write to the same variable tag that represents the same memory region.
+
+Another way is to always produce a memory allocation plan that is safe, which means never allocating the same memory to nodes that can
+be parallelized. This may not be ideal, because sometimes memory reduction is more desirable, and there is not too
+much gain from running multiple computing streams on the same GPU.
+
+### Try to Allow More Parallelization
+Given that we can always stay correct, we are now safe to do some optimization. The general idea is to
+encourage memory sharing between nodes that cannot be parallelized. This again can be done by creating an ancestor-relation
+graph and querying it during allocation, which costs around ```$O(n^2)$``` time to construct. We can also use a heuristic here;
+for example, one way is to color the paths in the graph.
+The idea is shown in the figure below: each time we find a longest path in the graph, we color it with the same color,
+and continue.
+
+![Path Color](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/memory/graph_color.png)
+
+After we get the color of each node, we only allow (or encourage) sharing between nodes of the same color.
+This is a stricter version of the ancestor relation, but it only costs ```$O(n)$``` time if we only search for the first ```k``` paths.
+
+The strategy discussed here is by no means the only solution; we can expect more sophisticated approaches along this line.
+
+How much can We Save
+--------------------
+Thanks for reading till this part!
We have discussed the techniques and algorithms we can use to squeeze the memory usage of deep learning. +Now comes the question on how much we can really save by using these techniques. + +The answer is we can roughly reduce the memory consumption ***by half*** using these techniques. This is on the coarse grained operation graphs that are already optimized with big operations. More memory reduction could be seen if we are optimizing a fine-grained computation network used by symbolic libraries such as Theano. + +Most of the ideas in this article inspires the design of mxnet. +We provide an [Memory Cost Estimation Script](https://github.com/dmlc/mxnet/tree/master/example/memcost), +which you can play with to see how much memory we need under different strategies. + +If you play with the script, there is one option called ```forward_only```, which shows the cost only running the forward pass. +You will find that the cost is extremely low compared to others. You won't be surprised if you read previous part of +the article, this is simply because more memory re-use if we only run the forward pass. So here are the two takeaways: + +- Use computation graph to allocate the memory smartly and correctly. +- Running deep learning prediction cost much less memory than deep learning training. + +Contribution to this Note +------------------------- +This note is part of our effort to [open-source system design notes](http://mxnet.readthedocs.org/en/latest/#open-source-design-notes) + for deep learning libraries. You are more welcomed to contribute to this Note, by submitting a pull request. + diff --git a/doc/env_var.md b/doc/env_var.md index 16a9ee4ff3bc..d274be269c6e 100644 --- a/doc/env_var.md +++ b/doc/env_var.md @@ -3,7 +3,7 @@ Environment Variables MXNet have several settings that can be changed via environment variable. Usually you do not need to change these settings, but they are listed here for reference. -* MXNET_GPU_WORKER_NTHREADS (default=1) +* MXNET_GPU_WORKER_NTHREADS (default=2) - Maximum number of threads that do the computation job on each GPU. * MXNET_GPU_COPY_NTHREADS (default=1) - Maximum number of threads that do memory copy job on each GPU. @@ -16,7 +16,7 @@ Usually you do not need to change these settings, but they are listed here for r * MXNET_EXEC_MATCH_RANGE (default=10) - The rough matching scale in symbolic execution memory allocator. - Set this to 0 if we do not want to enable memory sharing between graph nodes(for debug purpose). -* MXNET_EXEC_NUM_TEMP (default=4) +* MXNET_EXEC_NUM_TEMP (default=1) - Maximum number of temp workspace we can allocate to each device. - Set this to small number can save GPU memory. - It will also likely to decrease level of parallelism, which is usually OK. diff --git a/doc/index.md b/doc/index.md index 6ca7dff52683..6298ba800996 100644 --- a/doc/index.md +++ b/doc/index.md @@ -3,18 +3,14 @@ MXNet Documentation [MXNet](https://github.com/dmlc/mxnet) is a deep learning framework designed for both *efficiency* and *flexibility*. It allows you to mix the flavours of deep learning programs together to maximize the efficiency and your productivity. -How to Get Started ------------------- -* Check out [Python Getting started Guide](python/tutorial.md) -* The [example](../example) folder contains example usecases of mxnet. 
- User Guide ---------- * [Build and Installation](build.md) * [Python Package Document](python/index.md) * [R Package Document](R-package/index.md) +* [MXNet.jl Julia Package](https://github.com/dmlc/MXNet.jl) * [Frequently Asked Questions](faq.md) - +* [Pretrained Model Gallery](pretrained.md) Developer Guide --------------- @@ -35,6 +31,12 @@ in terms of abstraction, optimization and trade-offs. * [Programming Models for Deep Learning](program_model.md) * [Dependency Engine for Deep Learning](developer-guide/note_engine.md) +* [Squeeze the Memory Consumption of Deep Learning](developer-guide/note_memory.md) + +Tutorial +-------- +* [Training Deep Net on 14 Million Images on A Single Machine](tutorial/imagenet_full.md) + Indices and tables ------------------ diff --git a/doc/pretrained.md b/doc/pretrained.md new file mode 100644 index 000000000000..159b56017698 --- /dev/null +++ b/doc/pretrained.md @@ -0,0 +1,7 @@ +Pretrained Model Gallary +======================== +This document contains the the pretrained in MXNet + +* [89.9% Top-5 Validation Accuracy for ImageNet 1,000 Classes Challenge](https://github.com/dmlc/mxnet-model-gallery/tree/master/imagenet-1k-inception) +* [37.2% Top-1 Training Accuracy for Full ImageNet 21,841 Classes](https://github.com/dmlc/mxnet-model-gallery/tree/master/imagenet-21k-inception) + diff --git a/doc/program_model.md b/doc/program_model.md index 753d7d77acb4..fdfb6799d882 100644 --- a/doc/program_model.md +++ b/doc/program_model.md @@ -38,7 +38,7 @@ The difference in symbolic programs is when ```C = B * A``` is executed, there i Instead, these operations generates a computation graph (symbolic graph) that represents the computation it described. The following picture gives a computation graph to compute ```D```. -![Comp Graph](https://raw.githubusercontent.com/dmlc/dmlc.github.io/master/img/mxnet/prog_model/comp_graph.png) +![Comp Graph](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/prog_model/comp_graph.png) Most symbolic style programs will contain, either explicitly or implicitly, a ```compile``` step. This converts the computation graph into a function that can be called. @@ -88,7 +88,7 @@ d = c + 1 ... ``` -![Comp Graph](https://raw.githubusercontent.com/dmlc/dmlc.github.io/master/img/mxnet/prog_model/comp_graph.png) +![Comp Graph](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/prog_model/comp_graph.png) Assume each cell in the array cost 8 bytes. How many memory do we need to cost if we are going to execute the above program in python console? Let us do some math, we need memory for 4 arrays of size 10, that means we will need ```4 * 10 * 8 = 320``` bytes. On the other hand, @@ -110,7 +110,7 @@ Another optimization that symbolic programs can do is operation folding. In the Which is represented in the following graph. This means one GPU kernel will be executed(instead of two) if the computation runs on GPU. This is actually what we will do to hand crafted operations in optimized libraries such as cxxnet, caffe. Doing so will improve the computation efficiency. -![Comp Graph Folded](https://raw.githubusercontent.com/dmlc/dmlc.github.io/master/img/mxnet/prog_model/comp_graph_fold.png) +![Comp Graph Folded](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/prog_model/comp_graph_fold.png) We cannot do that in imperative programs. Because the intermediate value can be reference some point in the future. 
The reason that such optimization is possible in symbolic programs, is that we get the entire computation graph, and a clear @@ -178,7 +178,7 @@ grad_a, grad_b = f(A=np.ones(10), B=np.ones(10)*2) The grad function of D generate a backward computation graph, and return a gradient node ```gA, gB```. They corresponds to the red nodes in the following figure. -![Comp Graph Folded](https://raw.githubusercontent.com/dmlc/dmlc.github.io/master/img/mxnet/prog_model/comp_graph_backward.png) +![Comp Graph Folded](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/prog_model/comp_graph_backward.png) What the imperative program did was actually the same as the symbolic way. It implicitly saves a backward computation graph in the grad closure. When we invoked the ```d.grad```, we start from ```d(D)```, diff --git a/doc/python/symbol_in_pictures.md b/doc/python/symbol_in_pictures.md index 64c5d9cb1f25..dd924f98c141 100644 --- a/doc/python/symbol_in_pictures.md +++ b/doc/python/symbol_in_pictures.md @@ -8,7 +8,7 @@ Compose Symbols The symbols are description of computation we want to do. The symbolic construction API generates the computation graph that describes the need of computation. The following picture is how we compose symbols to describe basic computations. -![Symbol Compose](https://raw.githubusercontent.com/dmlc/dmlc.github.io/master/img/mxnet/symbol/compose_basic.png) +![Symbol Compose](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/symbol/compose_basic.png) - The ```mxnet.symbol.Variable``` function creates argument nodes that represents inputs to the computation. - The Symbol is overloaded with basic element-wise arithmetic operations. @@ -18,14 +18,14 @@ Configure Neural Nets Besides fine-grained operations, mxnet also provide a way to perform big operations that is analogy to layers in neural nets. We can use these operators to describe a neural net configuration. -![Net Compose](https://raw.githubusercontent.com/dmlc/dmlc.github.io/master/img/mxnet/symbol/compose_net.png) +![Net Compose](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/symbol/compose_net.png) Example of Multi-Input Net -------------------------- The following is an example of configuring multiple input neural nets. -![Multi Input](https://raw.githubusercontent.com/dmlc/dmlc.github.io/master/img/mxnet/symbol/compose_multi_in.png) +![Multi Input](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/symbol/compose_multi_in.png) Bind and Execute Symbol @@ -33,11 +33,11 @@ Bind and Execute Symbol When we need to execute a symbol graph. We call bind function to bind ```NDArrays``` to the argument nodes to get a ```Executor```. -![Bind](https://raw.githubusercontent.com/dmlc/dmlc.github.io/master/img/mxnet/symbol/bind_basic.png) +![Bind](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/symbol/bind_basic.png) You can call ```Executor.Forward``` to get the output results, given the binded NDArrays as input. -![Forward](https://raw.githubusercontent.com/dmlc/dmlc.github.io/master/img/mxnet/symbol/executor_forward.png) +![Forward](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/symbol/executor_forward.png) Bind Multiple Outputs @@ -45,7 +45,7 @@ Bind Multiple Outputs You can use ```mx.symbol.Group``` to group symbols together then bind them to get outputs of both. 
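A tiny, hypothetical example of such grouping (the layer names and shapes here are made up; the figure below illustrates the same idea):

```python
import mxnet as mx

# Two symbols that share the same input; Group lets one executor return both.
data = mx.symbol.Variable('data')
fc = mx.symbol.FullyConnected(data=data, name='fc', num_hidden=16)
act = mx.symbol.Activation(data=fc, name='relu', act_type='relu')
group = mx.symbol.Group([fc, act])

exe = group.simple_bind(ctx=mx.cpu(), data=(2, 8))  # only the input shape is needed
exe.forward()
print([out.shape for out in exe.outputs])           # one output per grouped symbol
```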
-![MultiOut](https://raw.githubusercontent.com/dmlc/dmlc.github.io/master/img/mxnet/symbol/executor_multi_out.png) +![MultiOut](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/symbol/executor_multi_out.png) But always remember, only bind what you need, so system can do more optimizations for you. @@ -55,7 +55,7 @@ Calculate Gradient You can specify gradient holder NDArrays in bind, then call ```Executor.backward``` after ```Executor.forward``` will give you the corresponding gradients. -![Gradient](https://raw.githubusercontent.com/dmlc/dmlc.github.io/master/img/mxnet/symbol/executor_backward.png) +![Gradient](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/symbol/executor_backward.png) Simple Bind Interface for Neural Nets @@ -65,14 +65,14 @@ graph like neural nets. ```Symbol.simple_bind``` provides a way to simplify the procedure. You only need to specify input data shapes, and the function will allocate the arguments, and bind the Executor for you. -![SimpleBind](https://raw.githubusercontent.com/dmlc/dmlc.github.io/master/img/mxnet/symbol/executor_simple_bind.png) +![SimpleBind](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/symbol/executor_simple_bind.png) Auxiliary States ---------------- Auxiliary states are just like arguments, except that you cannot take gradient of them. These are states that may not be part of computation, but can be helpful to track. You can pass the auxiliary state in the same way as arguments. -![SimpleBind](https://raw.githubusercontent.com/dmlc/dmlc.github.io/master/img/mxnet/symbol/executor_aux_state.png) +![SimpleBind](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/symbol/executor_aux_state.png) More Information ---------------- diff --git a/doc/python/tutorial.md b/doc/python/tutorial.md index a32442173ba1..09a70df07c04 100644 --- a/doc/python/tutorial.md +++ b/doc/python/tutorial.md @@ -257,7 +257,7 @@ We can also specify the automatic generated names explicitly: ```python >>> net = mx.symbol.Variable('data') >>> w = mx.symbol.Variable('myweight') ->>> net = sym.FullyConnected(data=data, weight=w, name='fc1', num_hidden=128) +>>> net = sym.FullyConnected(data=net, weight=w, name='fc1', num_hidden=128) >>> net.list_arguments() ['data', 'myweight', 'fc1_bias'] ``` diff --git a/doc/tutorial/imagenet_full.md b/doc/tutorial/imagenet_full.md new file mode 100644 index 000000000000..bf692ec5882b --- /dev/null +++ b/doc/tutorial/imagenet_full.md @@ -0,0 +1,108 @@ +# Training Deep Net on 14 Million Images by Using A Single Machine + +This note describes how to train a neural network on Full ImageNet Dataset [1] with 14,197,087 images in 21,841 classes. **We achieved a state-of-art model by using 4 GeForce GTX 980 cards on a single machine in 8.5 days.** + +There are several technical challenges in this problem. +1. How to pack and store the massive data. +2. How to minimize the memory consumption of the network, so we can use net with more capacity than those used for ImageNet 1K +3. How to train the model fast. + +We also released our pre-trained model for this full ImageNet dataset. + +## Data Preprocessing +The raw full ImageNet dataset is more than 1TB. Before training the network, we need to shuffle these images then load batch of images to feed the neural network. 
Before we describe how we solve it, let’s do some calculation first: + +Assume we have two good storage device [2]: + +``` +| Device | 4K Random Seek | Sequential Seek | +| ------------------------- | --------------------- | --------------- | +| WD Black (HDD) | 0.43 MB /s (110 IOPS) | 170 MB/s | +| Samsung 850 PRO (SSD) | 40 MB/s (10,000 IOPS) | 550 MB/s | +``` + +A very naive approach is loading from a list by random seeking. If use this approach, we will spend 677 hours with HDD or 6.7 hours with SSD respectively. This is only about read. Although SSD looks not bad, but 1TB SSD is not affordable for everyone. + +But we notice sequential seek is much faster than random seek. Also, loading batch by batch is a sequential action. Can we make a change? The answer is we can't do sequential seek directly. We need random shuffle the training data first, then pack them into a sequential binary package. + +This is the normal solution used by most deep learning packages. However, unlike ImageNet 1K dataset, where we ***cannot*** store the images in raw pixels format. Because otherwise we will need more than 1TB space. Instead, we need to pack the images in compressed format. + +***The key ingredients are*** +- Store the images in jpeg format, and pack them into binary record. +- Split the list, and pack several record files, instead of one file. + - This allows us to pack the images in distributed fashion, because we will be eventually bounded by the IO cost during packing. + - We need to make the package being able to read from several record files, which is not too hard. +This will allow us to store the entire imagenet dataset in around 250G space. + +After packing, together with threaded buffer iterator, we can simply achieve an IO speed of around 3,000 images/sec on a normal HDD. + +## Training the model + + +Now we have data. We need to consider which network structure to use. We use Inception-BN [3] style model, compared to other models such as VGG, it has fewer parameters, less parameters simplified sync problem. Considering our problem is much more challenging than 1k classes problem, we add suitable capacity into original Inception-BN structure, by increasing the size of filter by factor of 1.5 in bottom layers of original Inception-BN network. +This however, creates a challenge for GPU memory. As GTX980 only have 4G of GPU RAM. We really need to minimize the memory consumption to fit larger batch-size into the training. To solve this problem we use the techniques such as node memory reuse, and inplace optimization, which reduces the memory consumption by half, more details can be found in [memory optimization note](http://mxnet.readthedocs.org/en/latest/developer-guide/note_memory.html) + +Finally, we cannot train the model using a single GPU because this is a really large net, and a lot of data. We use data parallelism on four GPUs to train this model, which involves smart synchronization of parameters between different GPUs, and overlap the communication and computation. A [runtime denpdency engine](https://mxnet.readthedocs.org/en/latest/developer-guide/note_engine.html) is used to simplify this task, allowing us to run the training at around 170 images/sec. 
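To see roughly where these figures come from, here is a quick back-of-envelope check in Python. The exact numbers quoted in this note were presumably computed under slightly different assumptions (effective dataset size, average throughput), so treat these only as rough cross-checks of the I/O estimate above and of the end-to-end training time.

```python
# Back-of-envelope checks for the numbers quoted in this note (pure arithmetic,
# no MXNet needed). The results are approximate by construction.
n_images = 14197087            # full ImageNet
dataset_mb = 1024 * 1024       # ~1 TB of raw data, in MB

# Reading 1 TB with random 4K seeks, which packed sequential records avoid:
print("HDD random read: %.0f hours" % (dataset_mb / 0.43 / 3600))   # ~677 h
print("SSD random read: %.1f hours" % (dataset_mb / 40.0 / 3600))   # ~7 h

# End-to-end training time at ~170 images/sec over 9 passes of the data:
seconds = n_images * 9 / 170.0
print("Training: %.1f days" % (seconds / 86400))                    # ~8.7 days
```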
+ +Here is a learning cureve of the training process: +![alt text](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/imagenet_full/curve.png "Learning Curve") + +## Evaluate the Performance +Train Top-1 Accuracy over 21,841 classes: 37.19% + +There is no official validation set over 21,841 classes, so we are using ILVRC2012 validation set to check the performance. Here is the result: + +``` +| Accuracy | Over 1,000 classes | Over 21,841 classes | +| -------- | ------------------ | ------------------- | +| Top-1 | 68.3% | 41.9% | +| Top-5 | 89.0% | 69.6% | +| Top=20 | 96.0% | 83.6% | +``` + +As we can see we get quite reasonable result after 9 iterations. Notably much less number of iterations is needed to achieve a stable performance, mainly due to we are facing a larger dataset. + +We should note that this result is by no means optimal, as we did not carefully pick the parameters and the experiment cycle is longer than the 1k dataset. We think there is definite space for improvement, and you are welcomed to try it out by yourself! + + +## The Code and Model +The code and step guide is publically available at [https://github.com/dmlc/mxnet/tree/master/example/imagenet](https://github.com/dmlc/mxnet/tree/master/example/imagenet) + +We also release a pretrained model under [https://github.com/dmlc/mxnet-model-gallery/tree/master/imagenet-21k-inception](https://github.com/dmlc/mxnet-model-gallery/tree/master/imagenet-21k-inception) + +## How to Use The Model +We should point out it 21k classes is much more challenging than 1k. Directly use the raw prediction is not a reasonable way. + +Look at this picture which I took in Mount Rainier this summer: + +![alt text](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/imagenet_full/rainier.png "Mount Rainer") + +We can figure out there is a mountain, valley, tree and bridge. And the prediction probability is : + +![alt text](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/imagenet_full/prob.png "Probability") + +We notice there are several peaks. Let's print out the label text in among 21k classes and ImageNet 1k classes: + +``` +| Rank | Over 1,000 classes | Over 21,841 classes | +| ----- | --------------------------- | -------------------------- | +| Top-1 | n09468604 valley | n11620673 Fir | +| Top-2 | n09332890 lakeside | n11624531 Spruce | +| Top-3 | n04366367 suspension bridge | n11621281 Amabilis fir | +| Top-4 | n09193705 alp | n11628456 Douglas fir | +| Top-5 | n09428293 seashore | n11627908 Mountain hemlock | +``` + +There is no doubt that directly use probability over 21k classes loss diversity of prediction. If you carefully choose a subset by using WordNet hierarchy relation, I am sure you will find more interesting results. + +## Note +[1] Deng, Jia, et al. "Imagenet: A large-scale hierarchical image database." *Computer Vision and Pattern Recognition*, 2009. CVPR 2009. IEEE Conference on. IEEE, 2009. + +[2] HDD/SSD data is from public website may not be accurate. + +[3] Ioffe, Sergey, and Christian Szegedy. "Batch normalization: Accelerating deep network training by reducing internal covariate shift." *arXiv preprint arXiv:1502.03167* (2015). 
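As a small appendix to the "How to Use The Model" section above, here is a sketch of turning a 21,841-way probability vector into a readable top-5 list. It is illustrative only: the probabilities are faked with random numbers, and the labels are placeholders for the synset file that ships with the pretrained model.

```python
import numpy as np

# Fake one prediction row; in practice this would be one row of the output of
# model.predict(...) for the pretrained 21k-class network.
prob = np.random.rand(21841)
prob /= prob.sum()

# Placeholder labels; replace with the synset list distributed with the model.
labels = ['n%08d class_%d' % (i, i) for i in range(21841)]

for rank, idx in enumerate(np.argsort(prob)[::-1][:5], 1):
    print('Top-%d: %s (p=%.5f)' % (rank, labels[idx], prob[idx]))
```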
+ + + + + diff --git a/example/README.md b/example/README.md index 3921d3fcd198..1f8f271d98e4 100644 --- a/example/README.md +++ b/example/README.md @@ -7,14 +7,14 @@ Notebooks * [composite symbol](notebooks/composite_symbol.ipynb) gives you a demo of how to composite a symbolic Inception-BatchNorm Network * [cifar-10 recipe](notebooks/cifar-recipe.ipynb) gives you a step by step demo of how to use MXNet * [cifar-100](notebooks/cifar-100.ipynb) gives you a demo of how to train a 75.68% accuracy CIFAR-100 model -* [predict with pretained model](notebooks/predict-with-pretrained-model.ipynb) gives you a demo of use a pretrained Inception-BN Network * [simple bind](notebooks/simple_bind.ipynb) gives you a demo of some details in ```mx.model``` module. Contents -------- * [mnist](mnist) gives examples on training mnist. * [cifar10](cifar10) gives examples on CIFAR10 dataset. - +* [adversary](adversary) Find adversary sample by using fast sign method +* [rnn](rnn) LSTM example Python Howto ------------ diff --git a/example/adversary/adversary_generation.ipynb b/example/adversary/adversary_generation.ipynb new file mode 100644 index 000000000000..3fafebd9b14d --- /dev/null +++ b/example/adversary/adversary_generation.ipynb @@ -0,0 +1,375 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Fast Sign Adversary Generation Example\n", + "\n", + "This notebook demos find adversary example by using symbolic API and integration with Numpy", + "\n", + "Reference: \n", + "\n", + "[1] Goodfellow, Ian J., Jonathon Shlens, and Christian Szegedy. \"Explaining and harnessing adversarial examples.\" arXiv preprint arXiv:1412.6572 (2014)." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "%matplotlib inline\n", + "import mxnet as mx\n", + "import numpy as np\n", + "\n", + "import matplotlib.pyplot as plt\n", + "import matplotlib.cm as cm\n", + "\n", + "from data import mnist_iterator" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Build Network\n", + "\n", + "note: in this network, we will calculate softmax, gradient in numpy" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "dev = mx.gpu()\n", + "batch_size = 100\n", + "train_iter, val_iter = mnist_iterator(batch_size=batch_size, input_shape = (1,28,28))" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# input\n", + "data = mx.symbol.Variable('data')\n", + "# first conv\n", + "conv1 = mx.symbol.Convolution(data=data, kernel=(5,5), num_filter=20)\n", + "tanh1 = mx.symbol.Activation(data=conv1, act_type=\"tanh\")\n", + "pool1 = mx.symbol.Pooling(data=tanh1, pool_type=\"max\",\n", + " kernel=(2,2), stride=(2,2))\n", + "# second conv\n", + "conv2 = mx.symbol.Convolution(data=pool1, kernel=(5,5), num_filter=50)\n", + "tanh2 = mx.symbol.Activation(data=conv2, act_type=\"tanh\")\n", + "pool2 = mx.symbol.Pooling(data=tanh2, pool_type=\"max\",\n", + " kernel=(2,2), stride=(2,2))\n", + "# first fullc\n", + "flatten = mx.symbol.Flatten(data=pool2)\n", + "fc1 = mx.symbol.FullyConnected(data=flatten, num_hidden=500)\n", + "tanh3 = mx.symbol.Activation(data=fc1, act_type=\"tanh\")\n", + "# second fullc\n", + "fc2 = mx.symbol.FullyConnected(data=tanh3, num_hidden=10)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "collapsed": 
true + }, + "outputs": [], + "source": [ + "def Softmax(theta):\n", + " max_val = np.max(theta, axis=1, keepdims=True)\n", + " tmp = theta - max_val\n", + " exp = np.exp(tmp)\n", + " norm = np.sum(exp, axis=1, keepdims=True)\n", + " return exp / norm" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def LogLossGrad(alpha, label):\n", + " grad = np.copy(alpha)\n", + " for i in range(alpha.shape[0]):\n", + " grad[i, label[i]] -= 1.\n", + " return grad" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Prepare useful data for the network" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "data_shape = (batch_size, 1, 28, 28)\n", + "arg_names = fc2.list_arguments() # 'data' \n", + "arg_shapes, output_shapes, aux_shapes = fc2.infer_shape(data=data_shape)\n", + "\n", + "arg_arrays = [mx.nd.zeros(shape, ctx=dev) for shape in arg_shapes]\n", + "grad_arrays = [mx.nd.zeros(shape, ctx=dev) for shape in arg_shapes]\n", + "reqs = [\"write\" for name in arg_names]\n", + "\n", + "model = fc2.bind(ctx=dev, args=arg_arrays, args_grad = grad_arrays, grad_req=reqs)\n", + "arg_map = dict(zip(arg_names, arg_arrays))\n", + "grad_map = dict(zip(arg_names, grad_arrays))\n", + "data_grad = grad_map[\"data\"]\n", + "out_grad = mx.nd.zeros(model.outputs[0].shape, ctx=dev)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Init weight " + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "for name in arg_names:\n", + " if \"weight\" in name:\n", + " arr = arg_map[name]\n", + " arr[:] = mx.rnd.uniform(-0.07, 0.07, arr.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def SGD(weight, grad, lr=0.1, grad_norm=batch_size):\n", + " weight[:] -= lr * grad / batch_size\n", + "\n", + "def CalAcc(pred_prob, label):\n", + " pred = np.argmax(pred_prob, axis=1)\n", + " return np.sum(pred == label) * 1.0\n", + "\n", + "def CalLoss(pred_prob, label):\n", + " loss = 0.\n", + " for i in range(pred_prob.shape[0]):\n", + " loss += -np.log(max(pred_prob[i, label[i]], 1e-10))\n", + " return loss" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Train a network" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Train Accuracy: 0.92\t Train Loss: 0.28077\n", + "Train Accuracy: 0.97\t Train Loss: 0.08434\n", + "Train Accuracy: 0.98\t Train Loss: 0.05849\n", + "Train Accuracy: 0.99\t Train Loss: 0.04577\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.4/dist-packages/ipykernel/__main__.py:11: DeprecationWarning: using a non-integer number instead of an integer will result in an error in the future\n", + "/usr/local/lib/python3.4/dist-packages/ipykernel/__main__.py:4: DeprecationWarning: using a non-integer number instead of an integer will result in an error in the future\n" + ] + } + ], + "source": [ + "num_round = 4\n", + "train_acc = 0.\n", + "nbatch = 0\n", + "for i in range(num_round):\n", + " train_loss = 0.\n", + " train_acc = 0.\n", + " nbatch = 0\n", + " train_iter.reset()\n", + " for data, label in train_iter:\n", + " 
arg_map[\"data\"][:] = data\n", + " model.forward(is_train=True)\n", + " theta = model.outputs[0].asnumpy()\n", + " alpha = Softmax(theta)\n", + " train_acc += CalAcc(alpha, label.asnumpy()) / batch_size\n", + " train_loss += CalLoss(alpha, label.asnumpy()) / batch_size\n", + " losGrad_theta = LogLossGrad(alpha, label.asnumpy())\n", + " out_grad[:] = losGrad_theta\n", + " model.backward([out_grad])\n", + " # data_grad[:] = grad_map[\"data\"]\n", + " for name in arg_names:\n", + " if name != \"data\":\n", + " SGD(arg_map[name], grad_map[name])\n", + " \n", + " nbatch += 1\n", + " #print(np.linalg.norm(data_grad.asnumpy(), 2))\n", + " train_acc /= nbatch\n", + " train_loss /= nbatch\n", + " print(\"Train Accuracy: %.2f\\t Train Loss: %.5f\" % (train_acc, train_loss))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Get pertubation by using fast sign method, check validation change" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Val Batch Accuracy: 1.0\n", + "Val Batch Accuracy after pertubation: 0.04\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.4/dist-packages/ipykernel/__main__.py:4: DeprecationWarning: using a non-integer number instead of an integer will result in an error in the future\n" + ] + } + ], + "source": [ + "val_iter.reset()\n", + "data, label = val_iter.next()\n", + "arg_map[\"data\"][:] = data\n", + "model.forward(is_train=True)\n", + "theta = model.outputs[0].asnumpy()\n", + "alpha = Softmax(theta)\n", + "print(\"Val Batch Accuracy: \", CalAcc(alpha, label.asnumpy()) / batch_size)\n", + "#########\n", + "grad = LogLossGrad(alpha, label.asnumpy())\n", + "out_grad[:] = grad\n", + "model.backward([out_grad])\n", + "noise = np.sign(data_grad.asnumpy())\n", + "arg_map[\"data\"][:] = data.asnumpy() + 0.15 * noise\n", + "model.forward(is_train=True)\n", + "raw_output = model.outputs[0].asnumpy()\n", + "pred = Softmax(raw_output)\n", + "print(\"Val Batch Accuracy after pertubation: \", CalAcc(pred, label.asnumpy()) / batch_size)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Visualize example after pertubation" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "true: 9\n", + "pred: 8\n" + ] + }, + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAPwAAAD8CAYAAABTq8lnAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJztfU2MLNlZ5blVr6oyq+q913YztC2rB7Ogd0hGI3njGZmF\nhUCWYNgYtYSwkBmxYBiEWNhmAQwsBixhIVigYbCRzSB+NAiPWcBgpEFjFvwY2WNmMDRI3ZLxuLv9\n09XvvarMrKzKO4uXX76TJ78bEZkZERmZeY8UisisrIjIyHvu9//dEGNERkbGfuBg0zeQkZHRHjLh\nMzL2CJnwGRl7hEz4jIw9QiZ8RsYeIRM+I2OPsDLhQwjfGUL4+xDCP4YQ3l/nTWVkZDSDsEocPoRw\nCOAfALwLwJcA/DWA52OMX6DP5AB/RsYGEWMM+t6dFc/1dgD/FGN8CQBCCL8D4HsAfIE/9O53v3t2\n/MILL+C5555b8XLNY9P3N5lMMBgMcHV1hcFgsHA8HA43dm9todfr4fT0FP1+f7a37fT0FKPRKPl8\nrq6usOtJZAcHBzg6OsKdO3fc7eDgicL+4osv+udY8dpvAfBFev3P0/cyMjI6jFUl/EpTqc3AvO/K\nrDyZTHBzc7Px69/e3uL29haTyWT2fOwZhRAK90Czz7jo+re3tzg8PFzr/Hfu3MHh4SEODw9xcHAw\n2/j7hRBm79tn79y5g6OjI0wmk+S57f6858P7XceqhP8SgGfp9bN4LOXn8MILLzy50J07s0E8mUzc\n403i9PQUV1dXG7t+jBHD4RCj0Qjj8Rjj8XiO+MCTwc6Dnl/bs0w953URQii89p07qw6nxzg5OcHJ\nyQmOj49xdHQ0Iz5PLEzy4+Pjue9VNIZubm5wcHDQ2fG3LmzslGHVX+gzAL4lhPBWAP8PwPcBeF4/\nxDaxPViTYLpt+oGfnJxgMBhs7PqTyQTX19ez7ebmBjc3Ny7hbdDzdnBw4D5Xk751gAln1+TX6+L4\n+Hi2sV1qtil/f5PoMcbZRFQ2hrzxZ+iStrkKzP9huLi4cD+3EuFjjDchhH8P4H8AOATwEfbQp2AP\n3AYzb3UNym1FjHEm2T0Jz4Pdc9gcHh4uPNMQwmzSqANl98Cq9yowqX10dDRzTh0eHs5J+IODgzlt\nkTWOMnjPB9gfdR5YXcIjxvhHAP5oic/PEf76+nphcO8zYozuJOhJeLNZebtz587sWdrgt2e+LhEN\nJuH1+kbSdeFNYqrSs4TXCaiMuDbu+Jwxxr0ae+sZXUuCCT8ej3F9fY3RaITr6+u9eugebOAVmTus\nUhvRjo+PcXJyMkcQO58977oIz84yvf7x8fHa5/dMFSY9E1zJXsUstMlQyV5FO9gVtEZ4HoAmiUaj\nEUajEYbD4UY95F2B50xiCW+EMwl7fHyMXq83I5wNZtWm6pTwdn1Tv09OTmb3sO51Ug5BddqFEDCZ\nTObU+ypmy2g0mj0fI7udr4oPYBfQKuHtIbOEHw6HGAwGmfDwQ2k8CFml9ghnn2eysw28LjwNo9fr\nodfrod/v13J+3WtIzjYLsS0TVjNJbhOEjcMs4WsCk9ge7ng8njvm9zLSUNIyGZgIReSuEscv+l+b\naNR3YMdNY5n79XB7e7vgH9gnsgMNE57j2jc3NzN73cJO5pTKqAaT3iyZjOS3t7czE0mfb1Ecfxkv\nN5sPnCRTlwaR0TwaJTzHtc1DaoklNiC3Pf7ZJtTpybanPl+LfKjTz4vj23tlMLKbV17j5BndR2sS\n3hJLWIXPEn45sCd/PB7P3mepb884lbiTiqFXyZLTUKCGzTK6j9YkvA1K27zU0YxisEqvry0OXxTH\n57CaJrlUscFTCT+Z8NuDVgnvxZgz4auBve/62vLEvWer6aeauMNqehmKYuQZ24HWVHqz1Tm+nMm+\nHLgIxsjOzjcvhu+p9BxW07BeETROzk7DjO1AaxIe8OPL2WlXHWbDc7os78vi+F7ijhG+1+uVXt+L\nj2eybxcaJTw7ljLqhanpvDd4Utfi6Op4W8aG33ZwroLW03NBDrC79fKt5tJnrIeyevgymOrObZL2\nyQbX/HsusQUwC2HuYr28IRN+i1BWD1/2v1zoso9hNXZa3t7ezjkqDw4O5joOcT8BYHdMz0z4LUFZ\nLXrVODonzexbppyW1/L7h4eHC2HNuvsJdAGZ8FuEonr4qnF07Xq6bxL+8PCwMBnJkpdYsmfCZ2wE\nRfXwZYS3/1WTYN8IbxOmV0+vz8OiIrtUPpsJv0UoqoevEkf3HH77SHg+5np6Nm+4hHaXnk8m/Bah\nqB4+x9HLwVEOr57eutpy34YqDtFtQqOEX6VtcSo5J2O+TbT2ZN+HOHoT4FwGr8fArqFRwnPb3DKY\nc8Rr77RLcdCMzSE1tuyY+wloCfeuoFHCL9P2qKiJ4644TDI2Cy448jbrs8iEr9ovb1vQGQmv/ep3\nNSySsTlohaGVafOxtl3btSYtnZHw3M22qb7qGfsNbRFmmzYN8XoK7Ao6RXgOi2j9d0bGumCz0VqC\n2TYajeYIvqv9Gjqj0ntqvDV2yBI+ow6UrY2gTUN2sW9DZyS8tm3SRQIyMtYFCxJu+mlrI3A3Idvv\nCtENjRJ+mdiwFSrYdnR0NPd61x78srCssKK10zeNKrXk6/TGbwpK7E3fU5O5KJ3KtNNaZXaY7Hti\nSQhhri98F8tbvTZm/Nrrhc/Zb02jrB5+00IlVYtfp9OwM4TnJYT0h7Ba5X1GCGGu4WQXq93YKebF\nu73inTbLc8vq4Tftjbdnps8OQG3O684QHpiX8Pbw7Ufa9I/RBWhb6S5KeI1z86a1/MB8QUvTKKuH\n37SEbyMXZS3ChxBeAvAAwC2AcYzx7Wuca/ZjeCul7DvheW231Prpm0Yqzm2xbpuoeDJvm/BF9fBd\nIHzT69evK+EjgG+PMX593RthcvNrU78y4cOCKty1jjWpOLelq/LvyCac2f5Nf4+yevhNE17D0PY8\neRnwdVGHSl/br2QDQCX7LoZHVoHn9OqShNdlmDnsNRqNZqRisrdJtLJ6+E3D1gsEFnNR6kIdEv5P\nQwi3AP5zjPG/rHoi+wG4LhnYzVjoquh6PXsqscXi3JYmrY082iZ8qh5+0zC/BmtKnIvSBQn/jhjj\nl0MI/wLAp0IIfx9j/LT98YUXXph98Omnn8bTTz9dekKuT2Y7i//epRhu16DPoujZecfLDCq9lk7S\nGlqyRh29Xg/9fh/9fh+np6ez47LfsSzOz9fV1zyWujpeYowL+Sf8OsY415qMV/4xLaoMaxE+xvjl\n6f4rIYQ/APB2ADPCP/fcc8ucy61RZnWrqCd7V3/ENsFNHJTM+tw0gYe1KgDusTfR8vFwOFzYTMIP\nh8MZ0ZnwtvV6vUqEZxNP+yV4efDblBOvfoXj4+O5+7Y2XN5vaJOn4eLiwr3GyoQPIZwCOIwxPgwh\nnAH4DgD/cdXzAcU18UDxYoaZ8Ivk1j2HxeyY9ymJacc6
oehmNntqY+nu7ct+Q8519445OsAlr9tS\nYl2UixJCmJlE/LsuO/bXkfDPAPiD6cXuAPitGOOfrHqyshhujHEuJGVx1Ez0eagkZ2lQtMwUh6VU\nFbbNm0R4z9VnVnLKr7n/nrdVIbyND++YIwJ2zM6vrkMnZo5o1BWaXpnwMcYXAbxt7Tt4cr45wnOd\nsq1R5816qoruM3j256w223Nba2/P6nKK8KnOtwcHBwuxd92Y8N5xGeG9hhUsFIbDIY6OjjAYDGY+\nIO48uw3jRHNRvNCht1WdDDqVaceE11plAAtxXFPrt+GHbAOenc7NLq2tNW9MPM8+5mPP9ufXJm09\nDe3m5mahrTYfn5yclBLeS+bRxB5tNV13WKtJpHJRWOKrOWOvq05onSG8DS6V8ObwAbCQbstqT8Zj\nMOk1ldUI53nI+/3+ghNMnafsN1E/isXUizbTJlJbGeFVCOhrS6hhyW6Za9sAzRMwNZ43nlRtr5GI\nInSG8AAWVHr28Koazxl4WcIveuhVuvN68P1+H2dnZzg7O8Pp6els71VpMelTmX4qgVKbZ1JwQVAV\nwpvnX7fhcDiTjEz2rhUYFcHLEwB8Ycix+WXSbztDeJ6lWCqwp1XLZveR7Kk4uw0SJjc75Y6Pj3F6\nejq3mZS3LUVUlfBMcn5dFjYzYjPpeatCeG99PLs+O+500cyUN7tr46fI627Sn3+bZSezzhA+oxyp\n+LptTCZPirIKz2vFczqzntsqFT2/gBcW8o7NsVoU0lvmGXgLcZgGoZEHb3VdL7uua8RvCpnwWwQl\nCcdiDw4OXKecOujUWcZltl7M3Zx1mvShmV4eae2c6uVf1ab2/BOshXiaDWsAXuadfd99QSb8lkE9\n8Szxjo6OZoRmaW7HRgbTBJgULIWNEHzsheE8dTIlrb1kkVUSpjwJb4TV76RqfyoCAewP6TPhtwgq\n3b04uznl2BlneyWB1tWrBORQj71OJd0Yce1zfM98vpRWUIX4+t21ht1LJuJNCW6Za/tCdiATfuug\nZNc4O0v4s7MznJ+f4/z8HGdnZ8k8bM6lT6XV2rWLbHBP2vPE4ZF9GSmv313vKyXh7Rlxaiqv+74N\nHvy6kAm/JUhJd4/wLOHPz89x79493L17d3YePp++B6SdWqn/KZLUTPoqdn+V56DJKfZeKmXYSM85\nG6x18H3uOjLhtwhqv7PzymxzS6w5PT3F+fk57t69i3v37uHevXuuDVuUtJEiQFW7Xf/uORtX+f58\nX/xMOBTnmS0e4dUE2XU0SvhlChZ0La+udCHZJFRyKsE1zq4lpxxnt0w6L76uz1qJvork43tn1bvo\nc1XBXn7WFlLPxTSfEMJCc8hl8tB3AY0S/urqqvJnb25uZllTvHLnviXXpNRjc1JprJ03U93Pzs5m\nsXYv+cTCbBZfLyqaMSzzG6T+T9uVVU0H5efBJgJ/HwBuVqGZOKPRCIeHh3MLlvK59gWNEn4wGFT+\n7O3t7Vx+tBJ+H36UIvvYCH90dJSsKTfnnBHeEm44vVRj66nsOD2uCrX/lfzL5H1XeU6s4nuE7/V6\nszFlGhJLeO63sA/ojISfTCYLS/fuY7daz4NtAzvlmNOUWduY8Oyl5tg6E1BVfgBLha48suvxuqQv\n0oB4UmTCm1RnbcC+r6n42WlXA5aR8PbwvbW590XCM1JeeXbMnZ2dzVT48/PzhZ5xvV5vQcIXkdBC\nVYZVnrt3bu9a/PlVia8TVkrC88IO9gysUMvCk/uCThFeyyn30X5XoquEt2w6IzyH3bymFmzDF8XZ\njdys3q7qxU5J8jIfQdXkG++9FOHH4/Gc2q4VmV3q698GOqPSe2ol25H7BI/42sTC1Pm7d+/i/v37\nuHfv3ozcXhyaVVmDquBsTvDvsSohlNz8W3o2fpXnohMDv2c5CUZ4k+w8nnShDO6LuA/ojIQH0svk\n7hPhPbKzfco2PMfZn3rqqcJ6dUtUST1LT81ehexVpPoqZOfnY/+rTk6V8EZu/j68NrxNhFmlrwnW\niy6jHCmSmwTSmDKHnGxLFbiwBCsiMDvouH0S9yQAikld5PFnqXtzc4OTk5OF2m6+Ry+zz/sOHMnw\nmn4Y8a2pSlfX5msDOdOuQ2D1XdtIm+3OtrmVti5btpoC+1G446yFS7WfGvd+Lwrx2V7Lc7VkNzVR\ned/JC2HaXh2dmlOvz2ufkAnfIagdyt1dzHbnDjEqqcpIUQazcb2egqPRKNmckluJFxG/qC+9JcYo\nMZWk7MzUjDvbq4akpK8jp39bkQnfIbCtbiTXrDHP++51oFl2MKudq+vCDYfDhX7zrAWMx+PSBB6t\nz7f9aDRCv99fmOTY6Xh0dDRX1WdkV3NFHZ0s5T0JnwmfsTGoR97UeO5koyo9O54823cZqM3Oq79e\nXV0tNJDU10XqfIxxlhBkyULD4RCnp6ezSYO1F/ZZWBIQd67hZ6bOv2Ul/D4hE74jsMHHKj1XwDHZ\nPQnPDq9ViZ9S6QeDAS4vL2eSfjAYLGzD4bA0Vdcq+AaDAc7OzuZqJ8bj8UKfeg7jcZKNRRxS39ez\n4S3JJiXh94X4mfAdAifYcAhOV2lRCa+E5/Ppe0UoUumvrq5wdXWFy8tLd391dVXquDOym1+A06gt\nVGaZcbroSKosNpVLwJNnSsLvm4ceyITvFFgqqYS3Yhi14ZXwjGUHc0qlN8I/evTI3R4+fIjLy8sF\nwivx1SxgsnOMnHP52czRZ2WhPG8i8FR6zwG4T9Id2ELCcx40Z01V/dHKYrybhGfDlznsVpVUaveW\nSffLy8sFkuveU+P5NUtd1WBMrbf10HWhSC3sub29nSvz1co+dd55XXf3jezAlhG+KA+6LMlHwzne\n8SahA1Sl/DKSPYWitFomu0r2y8vLmRQ3FZ7VcitnTmXVaaKO2tfqrFPfhKb78qTPSNVg2ETDnvtV\nIhm7gK0hPP/IvNyO/Y0XGvCgNl3K0bVJqITXhheedF/FMZfKkPOW+GJV3shujjpVzb1z63Xse3oZ\ncZ5fQidjTvnVQh9NAuJnas4+nej3DVtFePYiczGIeWCLwM4w3jSBY5PggchFMGW2+yrqvG5VJLwR\nnW1xdrx5OfLesTonOQSnWXE8ofG96iTAf/MID8BV57sy2beFUsKHED4K4N0AXo0xfuv0vTcC+F0A\n3wTgJQDviTFeNHifAJ6o9DwA2J4rAueje4OhC/BSaz0Jz6vFLEP4FNGLwnFsv7OHXVX6KnUT+tzZ\nbPEmNE8K831X+V58vXVTj3cBVST8bwD4FQAfp/c+AOBTMcYPhRDeP339gQbubwa121gqVbFlzcFj\nA4IHnTqwNgXPacfSz7Nx15Hu6gRLOezMKZdKujEJr99FX3uE50nNWx7Ks+GN7HbsTXrqvPNU+kx4\nBzHGT4cQ3ipvfzeAd06PPwbgz9Aw4af3Mit3ZMle5YfjBhBsQ3ap3r6I8EUScBXSq9fbJk8umFEJ\nr+uyq0pv3yG
1TxFel78q8lHwhGV/YxU/lXiUkvB8j/uAVW34Z2KMr0yPXwHwTE33kwTbbqkfuAhm\n46vDiJ1Jm4TnpdfQFdu5no1bhpTqWybhzYa30Bnn0vPm2cWp16kUYrXhVcLzd7HzGVLS217nsFwN\nTrsYYwwhtMKYdcgZYyztfa9OJ88J1RRijC6ZTJpeX18DWCz/PDg4qOS0BLAg0bm8NbUV9RjkGPfR\n0dEC2fS111pbJbtHdnXc6XPjY804ZNJrvH6V3zUVhaiKIg2o7Pycm6D5BzY5l2FVwr8SQnhTjPHl\nEMKbAby64nk6A8/Lqw+1SXB1GPskjJCWdsplpb1eby4jrQxKdD1Wu1ftbM5L9yYLnoh0Ugoh4I1v\nfOOsHdf5+flCo80U2ZcBT0imCdo5NJV32d+2bHyUncd7JuqY9M7PJiyTnr+DZWIaLi58H/qqhP8k\ngPcC+IXp/hMrnqdT0AHMr1kTaAJefgCr2tfX17NqMy5HBZ50eqny/ZTk+p4NWs/GLhqM5jzzmnHY\n9oY3vAFPPfUU7t27N6uYY8J7zSmWNVl4r2DNaZVGqUo8HSdl52FzjY+BJz6mMs0r9RtW/Q5VwnK/\njccOum8IIXwRwE8B+HkAvxdCeB+mYblKV+s4ODzltcxuEjzQgUWyD4dD3L17d5bVxpLdiFmGlHQq\nSlhh0gPpBSYAf0Dze/fv38f9+/dx9+5dnJ+flxJ+2dCZqsGquhvZPROl6vn5d9GtTChwQpX5j/hZ\nVzk/hxxXMU2qeOmfT/zpXZWvsgVgEpjzirdl1slbBRxiU7KbE82T7BzWMqQIooRXp12ZF93+lnLO\naXWaZjbywpYpwq+b8qzfjb+jt8jJOoRXx2UZ4e1ZqibF11cHKvt06tAytybTrg2wGqU93ZomvP3w\nrGVweGwwGCxIdm59VeX+VDoU+So80quKruo3E533dmxr1dtmdqfXyMObVKp+P88p6RGeO9pWIT0T\n3sjIYcoqhOeJVR2u3u/PY9DMJu/5VH1OmfBT6MPmjDP7UZuEeZF1MA2HQ/R6vVm9uX2WyW7JLwov\nGcVTB5XwNnhUwnvSW4+LNm85LF0dx+57FYcdfx8mtW3qtFtVpef6fe77V0b4o6OjuQnVnpv+Fqm+\nghaF0Ml2mQSsTHiC97BNwjZN+BDCgmTn0FW/3599zluQgouJUvDi8CrtDSq1OQtOF7zw+tB5e2ur\nzUtiMeFTiTDLEF+92byp024dlV4n5cFgUEp48wPxZHrnzp25/0ud3yYU1aA4FyNL+BVR5PhZ5n/5\nHGWwH8zzJVg/dwvJGcm1gKVKjLfMscWFOzypWK16aqtCel4GSzMHV7HXvWfvSUmewNeR8N71il4X\n3Z83MTHRPT8STwwcOq3VabdPUInG9laZFzylHi8jRXTAarYY23S6eZluTKJUgoc63VgSs8/Aio9Y\nRdfXqc3Lmfcy6daFkkkdX+uG5divYeOD/7+KDW/ajH3eNA8gvWT6sp74wnuo5Sw7ALVb1YFVhfBe\n/NT+tqxTSNW0MsLb+ufs+GKbL6Umq81uhOcwkEl9JbA65dQzr691vbt1EmxSz09Dq2yaMZE08aaq\nFqZCgcdIGeHtOZoaHmOcOVvtd9f6BLvHupAJT+AfU+1Zi0OnwM4cbrllJK4ClVCs4ocQSgnvJXQU\nEYn/poT3HIQaZvM2vr6+9iR/nRIegCvhjfDcIXcVld4IzwKBJ8sq5+EJzp7xZDKZaR0aCl7X7FBk\nwk/BarBJc35dlnhjP9p4PJ6TsEbWKmAJxeetSngjkU5WLIVskHrvGeGV7EdHRwtr16mXOJVdp5/T\nyaAJlZ5t4yIJv65Kz++pkEjdn27cKcgIrwlfdZEdyISfA6eoqvpWJqUnkydNEoFF9bwMbL/zexau\nA7Cw8oses2rKUoRztQ2q0gOYG8ga+jMfgZcHzv6Cor03CdSlzuszVPudVeU6bHh7zfUFZedhzUML\nlIoKmFibWBeZ8ASOA9sP64WsPDCxmOzLSDANkZlkNylc5rRTwnseXJbu9tpgA5nJrsUxVTc7d8qR\nqMd1oEjCG+nX8dLbPduxVQpWtbPNQWfS3CS82uye87cuZMJPwVKOiVLVoWM/uDqOliU8Z9LxvZUR\n3gYSS1MbkB7pPdh35z4BGrbT+1rlOLWvA15YTlV6L722CniS4oQZu24ZLDnKJk/20g+HwwWPvB5n\nCS/wBpIdq6PIsx/LBqAXY+d9HSg6p673ZktA6TJUtnEM3+rVq0pnQ51krIKq10898zLCe86wZcNe\nKR9IFXixeLvPpgu0gB0iPKuJnp1piR/cWWUZL7H+UKp23d7eLvRpr9vDak4dIzpnqE0mkwXC6+Z5\n1Tl0xs/Rjvn5No0yc6BswmW/hEf6ulJrU2p32Xk4954nnjoFRhl2hvDAfHmmeoO5N5x2fq06mNk+\n1Hg7p8Rq+mYdP6iZCab+XV5ezpHdpHnRpv3f9bXa1oY2pbxn73PUw56Ft3FkxLPjvXDXMjFuPbeO\nhbLf+ebmplDTaAM7Q3j1quvmpYIuS3iVHJy66aVvNiHhjfBM9pubGwyHQ1et581Uf16llZNr2NHH\nr+35Ng1PS9P7UDXcc6oWSXiW7m3Xw3MVpo6RtrAzhAfmwyQqwbxUUA6jlSE1iDRRItW0YF1wnHYw\nGMyRfTQa4erqqpDwli5rVWr9fn8h3s5kt/frchZVhZKek3c0gsGRDM53KCK8Sudl7Hc977L18N5E\n0aZ0B3aI8GqvcydUzd9WdX8ZCV8U7imKodYBk/CmutqgY7KnSH9ycoLz8/PZIOXKLXtemrSjIbym\n4ZGdN7abmeQa2WBistPOSKlbVdLreZeth/fMgazSrwHNc2Y1Vhs48MBa1oZP1csXOXTqgBHeyD4a\njeY0lzLC88Rkz4uTawzqOGub9Ep4LiPV1GMmmRJeTS2WqKwlLCvhV62HV81kmRh+XdgZwtvg1Mos\nU2XNKcWf1eMi8CBJ1cun4qd1qvRGdm/i0mWp1H7X6reijjlG9E1I+JQ/RsnOWYxseqRUektjtc/w\nvgpSEr5qPbydIzVO2sDOEN5DKsRUNJBTk4L9n8ZPWYo0jZS2YPeqDkQNP5mjzrrkeCvH2Pk4c5Cd\nZk0ilY9vmz0DYD7BiT3ymoeum2FVgvHEr9dsI46+LnaG8DwAbMBzOEclmELDQHrcFXhONB28nqeb\nO6PY/6gU5P+1c/L/6zWKXq+CIt9KUYTE9rbY5SbDXl3HzhAeeFJZZoOAbdAqhPfKOu1v2wJNVvGK\nVQCf8Px5m1j4PTs/7/V4HXhVdKn75cIh23tr1nthr30m/84QXgcED5Tb29XWjwfaU2eXQSpUxmT0\nJDwTnk0TLuvV/HsmvJI9ZTKtCi/+ztAJykwSIzlLeI5GbMJW7ip2hvDAE3J7UqGMtJaNp11MNhGL\nroIi0qtkVjVZHZBGIK/5h52HJ8CmCO9l1+n9elESi5QY4VMSft/JDuwQ
4XlA6OsqCTYHBwdz6ZGq\nynYRfK8ML2NNnwE7vIxAVtetEl67o/I1eKsDqXOmYuvsJTeVfpO56l3HzhAemA9dcfimSqydk06Y\n7Db4uybhGSqRVVJ6hPUkpteY0XP6pba64J2rSMLbyqmehK87F2LbsTOEtwEBYC7NsupgZEluA9zK\nSrdpsKhKn7KLPSeYNbwosuFZ1dZrrYtUjJz3XtmrEd7Ue69iMeMxdobwwHr16ZPJZOasYxXXbFzz\n9Gs65DLJKZ7Ti/frIISw0CXWq/1nwmvGmWaeeROHR/S6JLxmwWlWmhGaVXiT7Lb3vPTLJtekJhzN\na1hlDGwaO0X4daEqI3umDw8P52K8XP5aFakYfx0SMoRQud4/pZIXEVvDcHWr8QBmk6qRSo+vrq5w\neXk52/i1SvhVqtFSE429tslknQll08iEn0ITd1T9PTw8dBcJqGofMqFSRTzrIITg1vt74TQ7rkr0\npmx1haYt6/7q6iq5sf3u9Yir8huxmeMVuGgPQfYRbAsy4QleHN/ePzg4cJcrWsYhZB7v1Aot60LL\ngb0lnMrI7IXF2iK+PX9u46WeeFXh+VgJySWwVZBKmbVJSDeW8Nsi5UtHWQjhowDeDeDVGOO3Tt/7\nGQA/BOCRfG5iAAAdbklEQVQr0499MMb4x03dZBvQ2LS9Z68PDg4WUjqXTdvUaj4l6LpI1ft75Ew5\n9spIr+eoEyzhPTvdNv4bv/b6Eqyi0nMyEre6TuXq75qE/w0AvwLg4/ReBPDhGOOHG7mrDSEVxzeJ\nv04tMxOMy3d5UcV1YNpD2UIPTNxlJbv+v72uC2bDs/f90aNHMzvdiM3OO968gpllJmQvGYmz+TZd\ny14HSgkfY/x0COGtzp+6G5heEZyR5cXx1YmzbIyXSWkNOqx67fj4eO37V2egEt4j+jJqvJ6jbrB0\nHQ6HM6fcw4cP8fDhw7msOk/lZwJ6TS7KoMlImtyj3vlN1LOvi3X0yB8NIfwAgM8A+IkY40VN97QR\n2ICwH9WTaBym8Y7LYARUCW9NOtaFd88pNVwJrsdF6nxT5DfpyhLeCP/666/POeW4vNeONUS2rG3t\nZfNxco86ALcxR39Vwv8qgJ+dHv8cgF8E8L5a7qgAKUnFWOfHWPfHSxGBnXXemuq2rYui64cQ3Iae\nXmdbjud7PgDLPKx7sGthDDvqLi8v56IkqZV36sa2ENwmwzKsRPgY46t2HEL4dQB/uMp5lkEqhKTN\nEbz4aVt2lkpHPua++EVe9Dqu7z2nw8PDWQPLs7Mzd+v3++j3+8lYPg94LUhZ5/na/6bWzPPCYU3Y\n0PbcuGsSp1ZbAtY6Zl1T6PV6OD09nb2+uPAV7pUIH0J4c4zxy9OX3wvgb1c5z7JIxbAtLdZzqLQV\nNuHB4t2n9pxrYrlkvgfv+kZ43pjwvV5vbhUbXsucoVl5yzxf73P2XhHRta/8sjH2KuDnZ6vo2rkt\nSqNjjM29TZO+CqqE5X4bwDsBfEMI4YsAfhrAt4cQ3obH3voXAfxwo3eJ8r7zNgN7bYCbcDAV3WNZ\nX3ztolvH/ZU9n6OjoyTRbeNMPV2sw1CUjVYVTAw+TiXc6EKQTXnJ2anKZcL2vsX27T5szO2U0y7G\n+Lzz9kcbuJdSFMWxQwizAcKzrvYy29T9aYy8SQmfstO5L72S//z8vHSxDi/X3bSoZWLd3h4ol/Ae\n2euU8vz8lOz2m3HrNLv/nSJ8V2A2PBOKe7Gz/Qo8+SHqJNSq91e0tlvdhOewH5sQJycnher82dnZ\njOCe9mRgsqcaRBYhRfgYY6kNz6msdZPdnh/3QOAJnPsqsD/DIgvsxOwytorwHNbS5ZNSpZ9tER4o\nDrsdHx+7Dr067y91fV11JqXSqw8i1ThDSV+1Y6tHciU8Z7Z5hOfr12032+/Cx9wem5+FvWdZmNuC\nrSE8sKgy82BWspsN3+aP4UlYdoSl4uR1S3glvHnfmewe8VMJObZ5ZOdElarJLSnCp1R5s+HZXm8i\nPMbf2Rqi8HWY7Pa96/LBtIVGCV9HfrjBi1vz6xCCW/RgnymTLlVQFmdXG1i3dVGUWHNwML8QhUd0\nXleOJyJT+71rAYu92DlezoUl/Hne63upLVX15pUiN6U+F03AJkjUt1HnpN00GiU8xwXXxeHh4Uw1\n9hxKng3NMVRuV7VKHNXLQmO1nO+v6bCbt1mcXReM1GMOu+mS2UxKL87uVYwxKT0Se+E7df7ZsaXQ\nXl5ezppZVFmkMaM6GiV8v9+v7VyHh4dz9d7e6q9qw9pAMsJ7cXqgegyVY9wa7+Y4+6rrzxdBY+za\n3SYVZ1fSe4k1nlPOI28qbGb7VK1BUVMJPr68vJwVyzDhtcR1G5xjXcXWSPiDgydtpD0JqnFoL6zi\nxemXCat4sW4Ot2m5a1NeeF3+2p6LR3jeeH14vk+bND0pzAQtcqjZUsys9nPIjsNoutnfvPp2dQhm\nsq+HrZHwXkKLEoqdLfaaCerF6Vclu8bYvTh73amzmvbJ+e8admPHnG2aN+89w6I4uxKei1c0Tu4d\nexMA77kyjbvPNuGR31dsFeE9dZrJzjFUfq1tq8xe5RLYKvC84KzGtxVn5+uz402973xsK+iqw7Mo\nk05NoBTZtc9bakulPuv5telEW4lT+4CtUekBfykiT8Iz2TU5wshug2wZQnoSnp1gbcTZ1V9g3vZ+\nvz8jue5tU9s/ZcN7HnntAuO1ofKceuzJT0n+stfZaVcftkbCA344it8zSXVw8GT1UxvA3MmGB1TV\nkAr7Cbw49/HxcaNxdu/6RngvocYjfqrS0Mse08Qabi7JUp5J72XJeX0A+ZxK7CKPfsb6aJTwdcSe\nl4FOBvo3tf1tW/b8dqyvU9de9vypycyT6p4Kb954jbVrYk0qhqxqPRNfa9V580J1/F6K6PbeLqAo\nVwIoX2yjaWxVpt0mwSSwgc8+AavKS0nRqlpEymzhenYvNVaJruHLqgkiqVg6d6Ph9lO8KYm9vVf0\nsivSuyhPw557KiwJtEP6TPglYOaAEV59Al58Hqim1muc3atnTzWu4PJWTazhSIFKn5R091R7j/CP\nHj2aJcs8evSoNA7vhed2heyA/xvyazYn2UcCoDXHZCb8EuAfyVJJ2cHF4TgNDVYB2+hejN+T7ufn\n57Njr12V5irYdbwJSJNbqkh46zf3+uuvV35+uy7hUz0JYnyysg6vfcC+iqaRCV8ROvD5PZP6FvLi\nlF52IJbBiwJonJ0dcUz2s7MzN+ymKj1Q3IQy5TRjwnML6QcPHuDi4gIXFxdzURNPtV0lpXmboITX\nPA3LZVAVv64oThVkwi8B9farF5tDSBoarAKOs2shDHvjVZU34nOojfcq4e1aRd9Tyc6tm1mlN8J/\n/etfXwj3aQiwKG13V2C/OWeFmok1mUzmwqD2us08g0z4JcDOFZN4XLzikb2qFPMy+bw4e4rsWs/u\nOf9S0QtGymFXZMO//vrreO2
11xaqGTkb0c6t19gleBKe/Soszdmeryt0WwWZ8BVhg9OkuNrD3DiB\nJfUyaqsX59fwW5EdX8UxVybZ+biKDc8SnjUSO2bzxruO93pb4flgeNLWXJBN9GzYG8Lz4GepzMRc\nFcsQWvd2zNWANki8enYOu2kv+dT1qlaapYpbjLSaBadxdn22XmXiumDNoO0YtppuRY5HnVg9Kd6m\n7W7YG8IDizYyx0DXbdZxcHBQWg9fFGfX8lav+IVLW+3cLC3WRSrvnZ+TZ3qYJqLRgbprCTyHnxKv\nSXB2obc+PPs7xuPxQr2HFQhpf/02NZy9IbwOVB3EdRC+qB6+LM6uYTeP9JxQYwPJBhl3nLHvtCy8\nVVFVytt38fr2sWe6ifJgzfFfpYnmOtCUYW6qyfd3c3ODw8PDmdbDppFH+DajFXtDeGC+QYb9CEbE\ndQdMCKG0Hp41DI2zHx8fu2Rn0mtPe076qUvCF2XFGTzn4snJycJ3aoLwXhsz7m3QJNSMYQlt1+aI\nhsbZbWL2CN8W9orwRjhOirGBWwfhy+rhecIx4nr17FoAY+WtPFmoqmj3sA6KylhTKj13D/a68TQl\n4c2ByARsmvA62ah/QlV6luymDXgLpWSVvgHwQNXX6zrt7HxF9fCpsJvZ5L1eb0GF13XfvPzsOrO0\nUio9O6aKwofssON9E4TXMt3r6+vGieOZE/zs+f74tan46vj0ztE09obwwJPyWU2KqeuBlxVOsEah\n9q9544sWevTCWZwMtC60hFXtePsOXnJJr9ebm9h4Xzfh7R61Jr9p1TjlMFQJb8fWb8E2TWbS/28D\ne0N4Dn959fJ1XiOVt56SjEVLQDHhPUeaqtzrwFPnPTveSy6x9e2L0nbXhUp4JvxgMGhFSmpYUEOD\nHML00pnL/r9p7A3hgWYGYeoa+l5KnWfJrj3jNc4eQphTFw0sWcpQlPTCvea9zbrWcCiJtZY2UHT/\nRTH6tgjV9QzCvSJ800hlugFIqvHcmkpbSJsJwrajhsmWGWD6eU2jTbWosu3q6gpXV1dzoaU2vczs\nJ+FcCoNNRPqMdq1IZx1kwtcITWdlO7bIbjcJb1Jfm0t6hF9lIGu6rB6zmuy1s9KVYdpOHFH/geYG\npOLz9plM+kz42qFOKw7F6QKPVgGXKm81wscY53ror5Nd5jmMbNMutLpZK6vhcLgxCW/Pksluz1hN\nEI2DZ5QQPoTwLICPA/hGABHAr8UYfzmE8EYAvwvgmwC8BOA9McaLhu+10/AkO1euFan0RnovpAcs\nVul5qnkVsERnh59JQpbm1hteNyM/t5DelIRX34hNQpusN+86yiT8GMCPxxg/F0I4B/A3IYRPAfhB\nAJ+KMX4ohPB+AB+YbnsNj+ycOss17l4oTs0AHrRqby9rvxuK0lO9BpV8rCvNbErCaytyS5zSuP9k\n0n69eddRSPgY48sAXp4ePwohfAHAWwB8N4B3Tj/2MQB/hkx4APMFMtqTrsyGT3mWWcLbfhXvsxcH\n5uwxlfC27JPZ7txjnr32bTvtQnjcNJTVe3vNajzXm2c8RmUbPoTwVgDfBuAvATwTY3xl+qdXADxT\n+51tIdR+Z7J7hNfGFuyUq+qgW8Vpx9dgwrOEZ8JzV1pPO2hTpbfNkqZ44lMnpy0vZv+TnXYVCT9V\n538fwI/FGB/yjBljjCEE90m+8MILs+Onn34aTz/99Hp323GwU0kdcGyz86KOnF7LJPJi7qkYv4dU\nvJqz1bSene10z25POfvaJDzvFUZy9oHUmdrbZdjvVIZSwocQjvCY7L8ZY/zE9O1XQghvijG+HEJ4\nM4BXvf997rnnlrjl7YZJHfXE8+o0d+/encXceQVX1go8u9z+VoaipBM2D3gVGfbCa9iNk3BS8f8s\nNbsBi/gYLi58H3qZlz4A+AiAv4sx/hL96ZMA3gvgF6b7Tzj/vnfgSjiT6GyrG+HtfW2UwWopS7Oq\nqbNFSTX8vqnxbKvz3sJu7In3zIu2s9gy1keZhH8HgO8H8PkQwmen730QwM8D+L0QwvswDcs1dodb\nAk4xNcIXrRKjhFcpbnZnVbW5yIvPcWjNRWdbndV5XhyyqKVTJvt2ocxL/+cAUrrku+q/ne2FEVYX\neTw/P8fdu3dxfn4+F3dPqfR2Lk4qqWone9lznjRWwlsHWs2s0zXf7f+zOr+9yJl2NUJVeiP8vXv3\ncH5+PuesS6n0RnZT76tmiakH3rQD29u5WcKbNDdPfNFyzxwa9HwDGduBTPiakFLpTcLfu3dvYSko\nlfB2HiVU1cUsONzGJoGdyz7DTjsmfKpKTru6GLJav33IhK8RptIfHx/PVHeT8Pfv359rfaW937RR\nxDoZdCzJOevMPscS3rLoLM6udfDaxkmvmbFdaJTwXmNFrynAtsRJU/cdwnwDSwvJac58qmOtFyte\n9pmYNsAE1+Qab812Ds15ST9tx9rL4DkMbe+FELt0711Ao4S/urqaHXPKqXfcddJznrtXDWdqfGqR\nCE0GqeM760Bm+1ybWVxfX8/F2L3OqavW2bcFTQvWY84pYN9DF7/LptAo4QeDweyYmxfoti3Q/Hh+\nrc64KmRXNb4qUim2Zp9zXjxLco6z62IK3Dyiq2E3Nlu8zSvsaTP1dxvQmoTnnHJv/fSuS3hgsa88\nb6a6c4Yd2+lNkV1TZrXfGxfDWCadva8LKmhIr2uk56ShVAsu7fteNWlpX9CahOcup/YDcDpp18Eq\nvObKszrPKj2vQMO2eh3EB/y+dmyvc+MKzqQrWv0klaHXBeh31L70Xk++LOHn0Srh2YHCJaTbAm1E\nyWt/a2GMSnjOptNmGavAc1yphDc1XstcuWONR3g7X5cJ7/WlT3Xd7dJ32DRaU+k9NV5LHLsO9kOw\nN57z5rkvndrw6xI95Z3WGLv2bDfCs4T3bPiy62wanp+CU4E952P21M+jNQl/dHQEYJ7sbTZPWBfa\nYcVbv52LYjzHnZ1H9+v4L8pseLPdlfBVutZ0jSRF33EwGBQ2Csl4jEYJryuaWkMCU3PN5mqrpzmQ\nXixC/67/4y2+oF1rtKe8LqhYdp0qSNnZbLsbmbWRhRJePfTbAJbg2rxjW77DJtEa0zyHC3vndXKo\nG9qNpsyBpsdVl4fi3vK6ZDTfyypgFVXV1pubG7eBhdnwRXXumSj7g40R3ohmf2vaeeflAXB1Wtmm\najy3pzo/P5+R3at1LyL7MuTnZ6hpr1bfrk0o2WnHDi5W5TPh9wet5tIz4W2g2yCu0tFlHVg4TTfN\nA0hl1KWWiDIJf35+Pte2SiW8nXtVaDWcruXOYThuT8Wk99Ym3yZ1PmN9bETCm/rO6mgbhDcJrdEC\ng5JdF5Lw2kwb2c/Pz5POupRjbtV8eW4r7XmsmexMei9OnSX8fqFVCc8EZ0cTS8GmwI0krMiEQ4MG\nJjynz3oqvUl4I7xWwamEZ6walvOcVSzdUxL+6urK7TqbCb9faF3CA/OVXJ4HuwlwzJ/t
eR7wbLNr\nb3l12vFCkCblU3n2dVUFskrveeU5Ju3Z8RqbzjHq/UPrhGdy2b6NPHqOgzOJOVZbRnZNtlEJr5l0\ndVcDeumzWuJaZMN78elM9v1C6yr9pgbYZDJZcNhxTgBrAN5myxmpROTJITWJWaupMmhuPB/HGBfI\nzVlzHHrTNeDMUbcN8CYj22vqbBfTf7uOvep4w34DywMw+9rWJmOpzq9vbm4WVn7t9/uz6rPRaJQM\n6VX1T/AA1mKWyWTixtn5+PLyctZ9dhOLPa6Lonr3yWSykDefnY7LY28IX5YHYP4EtcFtPx6P3VVf\njWwnJyeuGl9Vpdewm263t7eFK7pamyouf2177bd1Yb9DquZd6921315GOfaG8MATZ6HlAYQQZiSz\nZYaZ6LyNx+O5hBuT7ka6Xq/ndsJZxoZP9ZOzTReL0E3TZ7dNwmtxTFm9e84jWB57RXiTIJz0YxKf\nk3C8tNvxeDznmTdysUrvxe95q3JvXMetx0z2FPG1/HWbJGBZvbvmEGSVfnnsDeF5MOlrU/FT9epM\n+NPTU1xeXuL09HTBSabaAZsEVVR6lmTeXhtZ6J5t3G204VnCc36Bfh+vu+62fMdNY28ID8y3ajay\nsyQHFotmmPBaKKMqtPa8M0dgldThGOOcuqrqKy8LpRl0Nul4i0hsq4TXxS6Hw2GycCiTvTr2hvA2\nKIwAqRBaqrjl+vp6lmDD6jyHwLwGnTY4qxCenVK6pZJp+D64JfU2dnzhtGGviUeud18fjRLeq3Mv\nirM2jXWvx8ktrGradufOnZXbIpsq62XMpVZ5VQnPlXR6vA1SXp1zumVyr49GCc/rVafiq9umlmls\nnL3ppi2YBlFVstveVFn2uGu3Gi/+bq9TIb1tITxPpLl8txk0Svh+vz87ZnVNt22BFyvn76GpuUWS\nXt+356Mrutp2dXW1oGHoVpS0sg2kYW1JV5HJqAeFhA8hPAvg4wC+EUAE8Gsxxl8OIfwMgB8C8JXp\nRz8YY/xj/X+W8NpPXOvht2FAAoukV8JbVR5/J4/c3jm5vfRgMMCjR4/w4MEDPHjwAA8fPnTtek5E\nUY2Jj7eBNBqK27YowzagTMKPAfx4jPFzIYRzAH8TQvgUHpP/wzHGDxf9M0t4bi3MGW5V88y7AFXn\nOUHGJjGuwPMkfOq1p9Ib4V977TU8ePBgwYuvHv1UWu62OLdSq9Zuw71vCwoJH2N8GcDL0+NHIYQv\nAHjL9M+lLFXCc8YZS8htQpFKr6G4MqJpgYyp9KPRaEb4119/HRcXF7i4uHDj85yMog7RbSI7gKTJ\nl0lfHyrb8CGEtwL4NgB/AeAdAH40hPADAD4D4CdijBf6P6zSe2o8273bACWRl/5aRHYluO6LJPxr\nr722IP30deq8eu2uwvM7ZLLXi0qEn6rz/w3Aj00l/a8C+Nnpn38OwC8CeJ/+H0t4bWtl5Gi6000T\n8ApbVCLxoC1T6+29FOEvLi7wta99rTDP3tOUto0o3iS1bd+h6yglfAjhCMDvA/ivMcZPAECM8VX6\n+68D+EPvf1988cXZ8f3799Hv93F0dDSTStZY4ubmyTrym4zTF8EmKu4hZ7Hwy8vLuW444/F41tvu\n+vp61uPOzuPtr6+v8fDhQzx8+HDOO2/b5eWlG1/n44z9hYVmy1DmpQ8APgLg72KMv0TvvznG+OXp\ny+8F8Lfe/z/33HOzY+ubzos5sLrGSStdVOvMi27rtRnJrZPOeDyekZ73dsxLbXmq/Xg8xte+9jV8\n9atfnTnpLi8vMRwO58JTRQ7BjP2FdWAyXFwsWNgAyiX8OwB8P4DPhxA+O33vJwE8H0J4Gx57618E\n8MNlN8RNI43cJpWs73vKaWNlrJuExskfPXo0K4qZTCYYjUazBpbenvvue4S/ubnBxcXFzF5nwlvI\nLZM+Y12Ueen/HIBnZP/RKhdjwttAtYlASx/VwdcFsIS3iIOp+YPBYK5dlh4b4VP2/O3t7SzebrF3\nJrzFoz2tJ5M+oypaK55hCa8LS3KPua7G6VnC2yKZRvbRaISTkxO3eIZLZFPOOjvX5eXlrE2VHbOE\nT8XYMzKqotVqOSa4JamYKq/LMrE3vwtgwrNkt5j50dHRQsMLro33Ji0mq5kFXq48S3j7v0z4jFXQ\nuoQH5ju9msSySQBYXJGmKxLeogmTyWSWAsuqu9dAo6jvvtrymkzDx54Kn1X6jGXROuGZ1LY3wttr\njtN3JTHHJLxNRB6hvbp63ReRU9V2jVR495SRsQwaJXyRs02JYKWkXouprsC+T1fMjLqRmqT4N6ii\nZaSaiGSsh1T7tWXQKOGvr68rf5ZTRDmZJNup7UHbbOtxUeltjNFtDeZNGhmrQduor0L8Rgm/zGon\nXBTCrZky2dsDR1K8NfKK2mgblvFhZCwHJfwqz7ZThNeyyC5k2O0TNDmKk4ds9R3egPk8CSW6agkZ\n68GLAG094bNKvzno0tiaImz191rmrM7KVGgyYz14C50s+1w7Y8PrwgtZpW8fnCfBxUBWCMRSBXhM\ndpP89v8s4dU0yGr9+vCWMNtKCW+loVml3xwsGcqyIXnxzJOTkzlpwp1++H2WQLo4Z8b68JyhW+m0\ns/h2Vuk3A5YaZr8b4fv9Pnq93qyIyXISbm5u3GWylfR2vox6sE7Ys9Ffgb23g8FgriGGIlUJlsne\nHjx1nJ134/F4bhntlJqect5tEmXjb9No6/5a+xWGw2Fbl8rIWEDXx19b95ddpxkZe4RM+IyMPUJo\nykYOIWTjOyNjg4gxLjhYGiN8RkZG95BV+oyMPUImfEbGHqEVwocQvjOE8PchhH8MIby/jWsugxDC\nSyGEz4cQPhtC+KsO3M9HQwivhBD+lt57YwjhUyGEF0IIfxJCeKpj9/czIYR/nj7Dz4YQvnND9/Zs\nCOF/hhD+bwjh/4QQ/sP0/U48v4L7a+X5NW7DhxAOAfwDgHcB+BKAvwbwfIzxC41eeAmEEF4E8K9i\njF/f9L0AQAjh3wB4BODjMcZvnb73IQBfjTF+aDppviHG+IEO3d9PA3gYSxYYbeHe3gTgTZEWQAXw\nbwH8IDrw/Aru7z1o4fm1IeHfDuCfYowvxRjHAH4HwPe0cN1l0ZnKjhjjpwG8Jm9/N4CPTY8/hseD\nZCNI3B/QgWcYY3w5xvi56fEjALYAaieeX8H9AS08vzYI/xYAX6TX/4wnX7AriAD+NITwmRDCv9v0\nzSTwTIzxlenxKwCe2eTNJPCjIYT/HUL4yCZNDkN4sgDqX6KDz4/u7y+mbzX+/Nog/DbE/d4RY/w2\nAN8F4EemKmtnER/bYV17rr8K4JsBvA3Al/F4gdGNYaou/z4eL4D6kP/WhecXZIFWtPT82iD8lwA8\nS6+fxWMp3xnYOnkxxq8A+AM8NkO6hlem9h9CCG8G8GrJ51tFjPHVOAWAX8cGn2F4sgDqb8bpAqjo\n0PMLiQVa23h+bRD+MwC+JYTw1hDCMYDvA/D
JFq5bCSGE0xDC3enxGYDvQGJxzA3jkwDeOz1+L4BP\nFHy2dUxJZEguMNrCfbgLoKIjzy91f209v1Yy7UII3wXglwAcAvhIjPE/NX7RigghfDMeS3Xgcbnw\nb236/kIIvw3gnQC+AY/tzZ8C8N8B/B6AfwngJQDviTH6S4S2f38/DeDb8VgdnS0wSjZzm/f2rwH8\nLwCfxxO1/YMA/godeH6J+/tJAM+jheeXU2szMvYIOdMuI2OPkAmfkbFHyITPyNgjZMJnZOwRMuEz\nMvYImfAZGXuETPiMjD1CJnxGxh7h/wP1lwmC4rJXUQAAAABJRU5ErkJggg==\n", + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import random as rnd\n", + "idx = rnd.randint(0, 99)\n", + "images = data.asnumpy() + 0.15 * noise\n", + "plt.imshow(images[idx, :].reshape(28,28), cmap=cm.Greys_r)\n", + "print(\"true: %d\" % label.asnumpy()[idx])\n", + "print(\"pred: %d\" % np.argmax(pred, axis=1)[idx])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.4.2" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/example/cifar10/cifar10.py b/example/cifar10/cifar10.py index ce51758ba962..efb1122504a0 100644 --- a/example/cifar10/cifar10.py +++ b/example/cifar10/cifar10.py @@ -61,7 +61,7 @@ def SimpleFactory(data, ch_1x1, ch_3x3): get_data.GetCifar10() batch_size = 128 -num_round = 10 +num_epoch = 10 num_gpus = 1 train_dataiter = mx.io.ImageRecordIter( @@ -84,12 +84,12 @@ def SimpleFactory(data, ch_1x1, ch_3x3): def test_cifar(): logging.basicConfig(level=logging.DEBUG) gpus = [mx.gpu(i) for i in range(num_gpus)] - model = mx.model.FeedForward(ctx=gpus, symbol=softmax, num_round = num_round, + model = mx.model.FeedForward(ctx=gpus, symbol=softmax, num_epoch=num_epoch, learning_rate=0.05, momentum=0.9, wd=0.0001, initializer=mx.init.Uniform(0.07)) model.fit(X=train_dataiter, eval_data=test_dataiter, - epoch_end_callback=mx.callback.Speedometer(batch_size)) + batch_end_callback=mx.callback.Speedometer(batch_size)) if __name__ == "__main__": test_cifar() diff --git a/example/imagenet/README.md b/example/imagenet/README.md index 33bc50c9feae..2039b2ec6c18 100644 --- a/example/imagenet/README.md +++ b/example/imagenet/README.md @@ -12,7 +12,11 @@ Note: A commonly mistake is forgetting shuffle the image list. 
This will lead fa - [alexnet.py](alexnet.py) : alexnet with 5 convolution layers followed by 3 fully connnected layers -- [inception.py](inception.py) : inception + batch norm network +- [inception.py](inception.py) : inception + batch norm network for ImageNet with 1000 classes problem +- [inception-full.py](inception-full.py) : This inception network is used for ImageNet with 21841 classes + +## Notebooks +- [predict with pretained model](predict-with-pretrained-model.ipynb) gives you a demo of use a pretrained Inception-BN Network ## Results diff --git a/example/imagenet/alexnet.py b/example/imagenet/alexnet.py index 9a74631a2174..dbf5e9a28ba4 100644 --- a/example/imagenet/alexnet.py +++ b/example/imagenet/alexnet.py @@ -59,4 +59,4 @@ wd = 0.00001) logging.basicConfig(level = logging.DEBUG) model.fit(X = train, eval_data = val, - epoch_end_callback = mx.callback.Speedometer(batch_size=batch_size)) + batch_end_callback = mx.callback.Speedometer(batch_size=batch_size)) diff --git a/example/imagenet/inception-full.py b/example/imagenet/inception-full.py new file mode 100644 index 000000000000..d703a6db59a2 --- /dev/null +++ b/example/imagenet/inception-full.py @@ -0,0 +1,101 @@ +# pylint: skip-file +import sys +sys.path.insert(0, "../mxnet/python") +import mxnet as mx +import logging +from data import ilsvrc12_iterator + + +logger = logging.getLogger() +logger.setLevel(logging.DEBUG) + +def ConvFactory(data, num_filter, kernel, stride=(1,1), pad=(0, 0), name=None, suffix=''): + conv = mx.symbol.Convolution(data=data, workspace=512, num_filter=num_filter, kernel=kernel, stride=stride, pad=pad, name='conv_%s%s' %(name, suffix)) + bn = mx.symbol.BatchNorm(data=conv, name='bn_%s%s' %(name, suffix)) + act = mx.symbol.Activation(data=bn, act_type='relu', name='relu_%s%s' %(name, suffix)) + return act + +def InceptionFactoryA(data, num_1x1, num_3x3red, num_3x3, num_d3x3red, num_d3x3, pool, proj, name): + # 1x1 + c1x1 = ConvFactory(data=data, num_filter=num_1x1, kernel=(1, 1), name=('%s_1x1' % name)) + # 3x3 reduce + 3x3 + c3x3r = ConvFactory(data=data, num_filter=num_3x3red, kernel=(1, 1), name=('%s_3x3' % name), suffix='_reduce') + c3x3 = ConvFactory(data=c3x3r, num_filter=num_3x3, kernel=(3, 3), pad=(1, 1), name=('%s_3x3' % name)) + # double 3x3 reduce + double 3x3 + cd3x3r = ConvFactory(data=data, num_filter=num_d3x3red, kernel=(1, 1), name=('%s_double_3x3' % name), suffix='_reduce') + cd3x3 = ConvFactory(data=cd3x3r, num_filter=num_d3x3, kernel=(3, 3), pad=(1, 1), name=('%s_double_3x3_0' % name)) + cd3x3 = ConvFactory(data=cd3x3, num_filter=num_d3x3, kernel=(3, 3), pad=(1, 1), name=('%s_double_3x3_1' % name)) + # pool + proj + pooling = mx.symbol.Pooling(data=data, kernel=(3, 3), stride=(1, 1), pad=(1, 1), pool_type=pool, name=('%s_pool_%s_pool' % (pool, name))) + cproj = ConvFactory(data=pooling, num_filter=proj, kernel=(1, 1), name=('%s_proj' % name)) + # concat + concat = mx.symbol.Concat(*[c1x1, c3x3, cd3x3, cproj], name='ch_concat_%s_chconcat' % name) + return concat + +def InceptionFactoryB(data, num_3x3red, num_3x3, num_d3x3red, num_d3x3, name): + # 3x3 reduce + 3x3 + c3x3r = ConvFactory(data=data, num_filter=num_3x3red, kernel=(1, 1), name=('%s_3x3' % name), suffix='_reduce') + c3x3 = ConvFactory(data=c3x3r, num_filter=num_3x3, kernel=(3, 3), pad=(1, 1), stride=(2, 2), name=('%s_3x3' % name)) + # double 3x3 reduce + double 3x3 + cd3x3r = ConvFactory(data=data, num_filter=num_d3x3red, kernel=(1, 1), name=('%s_double_3x3' % name), suffix='_reduce') + cd3x3 = ConvFactory(data=cd3x3r, 
num_filter=num_d3x3, kernel=(3, 3), pad=(1, 1), stride=(1, 1), name=('%s_double_3x3_0' % name)) + cd3x3 = ConvFactory(data=cd3x3, num_filter=num_d3x3, kernel=(3, 3), pad=(1, 1), stride=(2, 2), name=('%s_double_3x3_1' % name)) + # pool + proj + pooling = mx.symbol.Pooling(data=data, kernel=(3, 3), stride=(2, 2), pool_type="max", name=('max_pool_%s_pool' % name)) + # concat + concat = mx.symbol.Concat(*[c3x3, cd3x3, pooling], name='ch_concat_%s_chconcat' % name) + return concat + +def inception(nhidden, grad_scale): + # data + data = mx.symbol.Variable(name="data") + # stage 1 + conv1 = ConvFactory(data=data, num_filter=96, kernel=(7, 7), stride=(2, 2), pad=(3, 3), name='conv1') + pool1 = mx.symbol.Pooling(data=conv1, kernel=(3, 3), stride=(2, 2), name='pool1', pool_type='max') + # stage 2 + conv2red = ConvFactory(data=pool1, num_filter=128, kernel=(1, 1), stride=(1, 1), name='conv2red') + conv2 = ConvFactory(data=conv2red, num_filter=288, kernel=(3, 3), stride=(1, 1), pad=(1, 1), name='conv2') + pool2 = mx.symbol.Pooling(data=conv2, kernel=(3, 3), stride=(2, 2), name='pool2', pool_type='max') + # stage 2 + in3a = InceptionFactoryA(pool2, 96, 96, 96, 96, 144, "avg", 48, '3a') + in3b = InceptionFactoryA(in3a, 96, 96, 144, 96, 144, "avg", 96, '3b') + in3c = InceptionFactoryB(in3b, 192, 240, 96, 144, '3c') + # stage 3 + in4a = InceptionFactoryA(in3c, 224, 64, 96, 96, 128, "avg", 128, '4a') + in4b = InceptionFactoryA(in4a, 192, 96, 128, 96, 128, "avg", 128, '4b') + in4c = InceptionFactoryA(in4b, 160, 128, 160, 128, 160, "avg", 128, '4c') + in4d = InceptionFactoryA(in4c, 96, 128, 192, 160, 96, "avg", 128, '4d') + in4e = InceptionFactoryB(in4d, 128, 192, 192, 256, '4e') + # stage 4 + in5a = InceptionFactoryA(in4e, 352, 192, 320, 160, 224, "avg", 128, '5a') + in5b = InceptionFactoryA(in5a, 352, 192, 320, 192, 224, "max", 128, '5b') + # global avg pooling + avg = mx.symbol.Pooling(data=in5b, kernel=(7, 7), stride=(1, 1), name="global_pool", pool_type='avg') + # linear classifier + flatten = mx.symbol.Flatten(data=avg, name='flatten') + fc1 = mx.symbol.FullyConnected(data=flatten, num_hidden=nhidden, name='fc1') + softmax = mx.symbol.Softmax(data=fc1, name='softmax') + return softmax + +softmax = inception(21841, 1.0) + +batch_size = 64 +num_gpu = 4 +gpus = [mx.gpu(i) for i in range(num_gpu)] +input_shape = (3, 224, 224) + +train = ilsvrc12_iterator(batch_size=batch_size, input_shape=(3,224,224)) + +model_prefix = "model/Inception-Full" +num_round = 10 + +logging.info("This script is used to train ImageNet fullset over 21841 classes.") +logging.info("For noraml 1000 classes problem, please use inception.py") + +model = mx.model.FeedForward(ctx=gpus, symbol=softmax, num_round=num_round, + learning_rate=0.05, momentum=0.9, wd=0.00001) + +model.fit(X=train, + eval_metric="acc", + batch_end_callback=[mx.callback.Speedometer(batch_size), mx.callback.log_train_metric(100)], + epoch_end_callback=mx.callback.do_checkpoint(model_prefix)) diff --git a/example/imagenet/inception.py b/example/imagenet/inception.py index 6d0dc36bddad..263f3a22733f 100644 --- a/example/imagenet/inception.py +++ b/example/imagenet/inception.py @@ -95,4 +95,4 @@ def inception(nhidden, grad_scale): model.fit(X=train, eval_data=val, eval_metric="acc", - epoch_end_callback=mx.callback.Speedometer(batch_size)) + batch_end_callback=mx.callback.Speedometer(batch_size)) diff --git a/example/notebooks/predict-with-pretrained-model.ipynb b/example/imagenet/predict-with-pretrained-model.ipynb similarity index 100% rename from 
example/notebooks/predict-with-pretrained-model.ipynb
rename to example/imagenet/predict-with-pretrained-model.ipynb
diff --git a/example/memcost/Makefile b/example/memcost/Makefile
new file mode 100644
index 000000000000..ca6b543be4d4
--- /dev/null
+++ b/example/memcost/Makefile
@@ -0,0 +1,22 @@
+
+.PHONY: no_optimization with_inplace with_sharing with_both
+
+no_optimization:
+	@echo "Estimating the cost with no optimization..."
+	@MXNET_EXEC_ENABLE_INPLACE=false MXNET_EXEC_MATCH_RANGE=0 python inception_memcost.py
+
+with_inplace:
+	@echo "Estimating the cost with inplace optimization..."
+	@MXNET_EXEC_ENABLE_INPLACE=true MXNET_EXEC_MATCH_RANGE=0 python inception_memcost.py
+
+with_sharing:
+	@echo "Estimating the cost with memory sharing ..."
+	@MXNET_EXEC_ENABLE_INPLACE=false python inception_memcost.py
+
+with_both:
+	@echo "Estimating the cost with all optimizations ..."
+	@python inception_memcost.py
+
+forward_only:
+	@echo "Estimating the cost of forward only ..."
+	@python inception_memcost.py 'null'
diff --git a/example/memcost/README.md b/example/memcost/README.md
new file mode 100644
index 000000000000..446c31e65ff2
--- /dev/null
+++ b/example/memcost/README.md
@@ -0,0 +1,30 @@
+Memory Cost of Deep Nets under Different Allocations
+====================================================
+This folder contains a script to show the memory cost of different allocation strategies,
+discussed in [Note on Memory Optimization](http://mxnet.readthedocs.org/en/latest/developer-guide/note_memory.html).
+
+We use inception-bn as an example, with a batch size of 32.
+
+How to See the Cost
+-------------------
+The possible options are gathered together in the [Makefile](Makefile).
+Type one of the following commands to see the allocation cost, and look for the
+final message ```Total x MB allocated``` (a quick standalone check is also sketched at the end of this file).
+- ```make no_optimization```
+  - Shows the cost without any optimization.
+- ```make with_inplace```
+  - Shows the cost with inplace optimization.
+- ```make with_sharing```
+  - Shows the cost with the memory-sharing allocation algorithm.
+- ```make with_both```
+  - Shows the cost with both the inplace and sharing optimizations.
+- ```make forward_only```
+  - Shows the cost when we only want to run the forward pass.
+
+Notes
+-----
+- You can change the symbol in [inception_memcost.py](inception_memcost.py) to the net you are interested in.
+- You will need to install mxnet, or type make in the root folder, before using the script.
+- The estimation only covers the space cost of intermediate nodes.
+  - The cost of temporary workspace is not estimated, so you will likely need more memory when running real nets.
+- The estimation does real allocation on CPU; the plan is the same on GPU. 
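+
+Quick Standalone Check
+----------------------
+The sketch below mirrors what [inception_memcost.py](inception_memcost.py) does, but on a toy
+multi-layer perceptron, so the effect of ```MXNET_EXEC_ENABLE_INPLACE``` and
+```MXNET_EXEC_MATCH_RANGE``` can be checked in seconds. The network and batch size here are
+illustrative assumptions, not part of the original example.
+
+```python
+import mxnet as mx
+
+# a tiny MLP; any symbol works, this one just binds quickly
+data = mx.symbol.Variable('data')
+fc1 = mx.symbol.FullyConnected(data=data, num_hidden=512, name='fc1')
+act1 = mx.symbol.Activation(data=fc1, act_type='relu', name='relu1')
+fc2 = mx.symbol.FullyConnected(data=act1, num_hidden=10, name='fc2')
+net = mx.symbol.Softmax(data=fc2, name='softmax')
+
+# bind on CPU; the execution planner reports its total allocation in debug_str()
+texec = net.simple_bind(ctx=mx.cpu(), data=(128, 784), grad_req='write')
+for line in texec.debug_str().split('\n'):
+    if 'Total' in line and 'allocated' in line:
+        print(line)
+```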
diff --git a/example/memcost/inception_memcost.py b/example/memcost/inception_memcost.py new file mode 100644 index 000000000000..8183c6774724 --- /dev/null +++ b/example/memcost/inception_memcost.py @@ -0,0 +1,90 @@ +# pylint: skip-file +import sys +sys.path.append('../../python/') +import mxnet as mx +import logging + +def ConvFactory(data, num_filter, kernel, stride=(1,1), pad=(0, 0), name=None, suffix=''): + conv = mx.symbol.Convolution(data=data, num_filter=num_filter, kernel=kernel, stride=stride, pad=pad, name='conv_%s%s' %(name, suffix)) + bn = mx.symbol.BatchNorm(data=conv, name='bn_%s%s' %(name, suffix)) + act = mx.symbol.Activation(data=bn, act_type='relu', name='relu_%s%s' %(name, suffix)) + return act + +def InceptionFactoryA(data, num_1x1, num_3x3red, num_3x3, num_d3x3red, num_d3x3, pool, proj, name): + # 1x1 + c1x1 = ConvFactory(data=data, num_filter=num_1x1, kernel=(1, 1), name=('%s_1x1' % name)) + # 3x3 reduce + 3x3 + c3x3r = ConvFactory(data=data, num_filter=num_3x3red, kernel=(1, 1), name=('%s_3x3' % name), suffix='_reduce') + c3x3 = ConvFactory(data=c3x3r, num_filter=num_3x3, kernel=(3, 3), pad=(1, 1), name=('%s_3x3' % name)) + # double 3x3 reduce + double 3x3 + cd3x3r = ConvFactory(data=data, num_filter=num_d3x3red, kernel=(1, 1), name=('%s_double_3x3' % name), suffix='_reduce') + cd3x3 = ConvFactory(data=cd3x3r, num_filter=num_d3x3, kernel=(3, 3), pad=(1, 1), name=('%s_double_3x3_0' % name)) + cd3x3 = ConvFactory(data=cd3x3, num_filter=num_d3x3, kernel=(3, 3), pad=(1, 1), name=('%s_double_3x3_1' % name)) + # pool + proj + pooling = mx.symbol.Pooling(data=data, kernel=(3, 3), stride=(1, 1), pad=(1, 1), pool_type=pool, name=('%s_pool_%s_pool' % (pool, name))) + cproj = ConvFactory(data=pooling, num_filter=proj, kernel=(1, 1), name=('%s_proj' % name)) + # concat + concat = mx.symbol.Concat(*[c1x1, c3x3, cd3x3, cproj], name='ch_concat_%s_chconcat' % name) + return concat + +def InceptionFactoryB(data, num_3x3red, num_3x3, num_d3x3red, num_d3x3, name): + # 3x3 reduce + 3x3 + c3x3r = ConvFactory(data=data, num_filter=num_3x3red, kernel=(1, 1), name=('%s_3x3' % name), suffix='_reduce') + c3x3 = ConvFactory(data=c3x3r, num_filter=num_3x3, kernel=(3, 3), pad=(1, 1), stride=(2, 2), name=('%s_3x3' % name)) + # double 3x3 reduce + double 3x3 + cd3x3r = ConvFactory(data=data, num_filter=num_d3x3red, kernel=(1, 1), name=('%s_double_3x3' % name), suffix='_reduce') + cd3x3 = ConvFactory(data=cd3x3r, num_filter=num_d3x3, kernel=(3, 3), pad=(1, 1), stride=(1, 1), name=('%s_double_3x3_0' % name)) + cd3x3 = ConvFactory(data=cd3x3, num_filter=num_d3x3, kernel=(3, 3), pad=(1, 1), stride=(2, 2), name=('%s_double_3x3_1' % name)) + # pool + proj + pooling = mx.symbol.Pooling(data=data, kernel=(3, 3), stride=(2, 2), pool_type="max", name=('max_pool_%s_pool' % name)) + # concat + concat = mx.symbol.Concat(*[c3x3, cd3x3, pooling], name='ch_concat_%s_chconcat' % name) + return concat + +def inception(nhidden, grad_scale): + # data + data = mx.symbol.Variable(name="data") + # stage 1 + conv1 = ConvFactory(data=data, num_filter=64, kernel=(7, 7), stride=(2, 2), pad=(3, 3), name='conv1') + pool1 = mx.symbol.Pooling(data=conv1, kernel=(3, 3), stride=(2, 2), name='pool1', pool_type='max') + # stage 2 + conv2red = ConvFactory(data=pool1, num_filter=64, kernel=(1, 1), stride=(1, 1), name='conv2red') + conv2 = ConvFactory(data=conv2red, num_filter=192, kernel=(3, 3), stride=(1, 1), pad=(1, 1), name='conv2') + pool2 = mx.symbol.Pooling(data=conv2, kernel=(3, 3), stride=(2, 2), name='pool2', 
pool_type='max') + # stage 2 + in3a = InceptionFactoryA(pool2, 64, 64, 64, 64, 96, "avg", 32, '3a') + in3b = InceptionFactoryA(in3a, 64, 64, 96, 64, 96, "avg", 64, '3b') + in3c = InceptionFactoryB(in3b, 128, 160, 64, 96, '3c') + # stage 3 + in4a = InceptionFactoryA(in3c, 224, 64, 96, 96, 128, "avg", 128, '4a') + in4b = InceptionFactoryA(in4a, 192, 96, 128, 96, 128, "avg", 128, '4b') + in4c = InceptionFactoryA(in4b, 160, 128, 160, 128, 160, "avg", 128, '4c') + in4d = InceptionFactoryA(in4c, 96, 128, 192, 160, 192, "avg", 128, '4d') + in4e = InceptionFactoryB(in4d, 128, 192, 192, 256, '4e') + # stage 4 + in5a = InceptionFactoryA(in4e, 352, 192, 320, 160, 224, "avg", 128, '5a') + in5b = InceptionFactoryA(in5a, 352, 192, 320, 192, 224, "max", 128, '5b') + # global avg pooling + avg = mx.symbol.Pooling(data=in5b, kernel=(7, 7), stride=(1, 1), name="global_pool", pool_type='avg') + # linear classifier + flatten = mx.symbol.Flatten(data=avg, name='flatten') + fc1 = mx.symbol.FullyConnected(data=flatten, num_hidden=nhidden, name='fc1') + softmax = mx.symbol.Softmax(data=fc1, name='softmax') + return softmax + + + +softmax = inception(1000, 1.0) +batch_size = 32 +softmax = inception(1000, 1.0) + +if len(sys.argv) == 2: + grad_req = sys.argv[1] +else: + grad_req = 'write' + +texec = softmax.simple_bind(ctx=mx.cpu(), + data=(batch_size, 3, 224, 224), + grad_req=grad_req) +# We extract the memory cost from the execution plan +print(texec.debug_str().split('\n')[-3]) diff --git a/example/mnist/lenet.py b/example/mnist/lenet.py index 6d185f8d278c..40779150ccfb 100644 --- a/example/mnist/lenet.py +++ b/example/mnist/lenet.py @@ -33,7 +33,7 @@ # dev = [mx.gpu(i) for i in range(2)] dev = mx.gpu() model = mx.model.FeedForward( - ctx = dev, symbol = lenet, num_round = 20, + ctx = dev, symbol = lenet, num_epoch = 20, learning_rate = 0.05, momentum = 0.9, wd = 0.00001) model.fit(X=train, eval_data=val, - epoch_end_callback=mx.callback.Speedometer(100)) + batch_end_callback=mx.callback.Speedometer(100)) diff --git a/example/mnist/mlp.py b/example/mnist/mlp.py index 7facf2d3bc50..0cfffe55cbe4 100644 --- a/example/mnist/mlp.py +++ b/example/mnist/mlp.py @@ -22,7 +22,7 @@ logging.basicConfig(level=logging.DEBUG) model = mx.model.FeedForward( - ctx = mx.cpu(), symbol = mlp, num_round = 20, + ctx = mx.cpu(), symbol = mlp, num_epoch = 20, learning_rate = 0.1, momentum = 0.9, wd = 0.00001) model.fit(X=train, eval_data=val) diff --git a/example/mnist/mlp_numpy.py b/example/mnist/mlp_numpy.py index af1cd011b148..114a6bf257d5 100644 --- a/example/mnist/mlp_numpy.py +++ b/example/mnist/mlp_numpy.py @@ -39,7 +39,7 @@ logging.basicConfig(level=logging.DEBUG) model = mx.model.FeedForward( - ctx = mx.cpu(), symbol = mlp, num_round = 20, + ctx = mx.cpu(), symbol = mlp, num_epoch = 20, learning_rate = 0.1, momentum = 0.9, wd = 0.00001) # train by using Numpy ndarray direcly diff --git a/example/notebooks/cifar-100.ipynb b/example/notebooks/cifar-100.ipynb index 48d6e00c5464..8e8c53a2d75b 100644 --- a/example/notebooks/cifar-100.ipynb +++ b/example/notebooks/cifar-100.ipynb @@ -503,8 +503,8 @@ "model.fit(X=train_dataiter,\n", " eval_data=test_dataiter,\n", " eval_metric=\"accuracy\",\n", - " epoch_end_callback=mx.callback.Speedometer(batch_size, 200),\n", - " iter_end_callback=mx.callback.do_checkpoint(model_prefix))\n" + " batch_end_callback=mx.callback.Speedometer(batch_size, 200),\n", + " epoch_end_callback=mx.callback.do_checkpoint(model_prefix))\n" ] }, { @@ -597,8 +597,8 @@ "model.fit(X=train_dataiter,\n", " 
eval_data=test_dataiter,\n", " eval_metric=\"accuracy\",\n", - " epoch_end_callback=mx.callback.Speedometer(batch_size, 200),\n", - " iter_end_callback=mx.callback.do_checkpoint(model_prefix))" + " batch_end_callback=mx.callback.Speedometer(batch_size, 200),\n", + " epoch_end_callback=mx.callback.do_checkpoint(model_prefix))" ] }, { diff --git a/example/notebooks/cifar-recipe.ipynb b/example/notebooks/cifar-recipe.ipynb index 832b68687572..eae38dab736c 100644 --- a/example/notebooks/cifar-recipe.ipynb +++ b/example/notebooks/cifar-recipe.ipynb @@ -269,15 +269,15 @@ "model.fit(X=train_dataiter,\n", " eval_data=test_dataiter,\n", " eval_metric=\"accuracy\",\n", - " epoch_end_callback=mx.callback.Speedometer(batch_size))\n", + " batch_end_callback=mx.callback.Speedometer(batch_size))\n", "\n", "# if we want to save model after every round, we can add check_point call back\n", "# model_prefix = './cifar_'\n", "# model.fit(X=train_dataiter,\n", "# eval_data=test_dataiter,\n", "# eval_metric=\"accuracy\",\n", - "# epoch_end_callback=mx.helper.Speedometer(batch_size),\n", - "# iter_end_callback=mx.callback.do_checkpoint(model_prefix))\n" + "# batch_end_callback=mx.helper.Speedometer(batch_size),\n", + "# epoch_end_callback=mx.callback.do_checkpoint(model_prefix))\n" ] }, { diff --git a/example/rnn/README.md b/example/rnn/README.md new file mode 100644 index 000000000000..74654c08b88c --- /dev/null +++ b/example/rnn/README.md @@ -0,0 +1,11 @@ +RNN Example +=========== +This folder contains RNN examples using low level symbol interface. + +- [lstm.py](lstm.py) Functions for building a LSTM Network +- [lstm_ptb.py](lstm_ptb.py) PennTreeBank language model by using LSTM +- [char_lstm.ipynb](char_lstm.ipynb) Notebook to demo how to train a character LSTM by using ```lstm.py``` + + +Performance Note: +More ```MXNET_GPU_WORKER_NTHREADS``` may lead to better performance. For setting ```MXNET_GPU_WORKER_NTHREADS```, please refer [Environment Variables](https://mxnet.readthedocs.org/en/latest/env_var.html) diff --git a/example/rnn/char_lstm.ipynb b/example/rnn/char_lstm.ipynb new file mode 100644 index 000000000000..72ba3f18dc41 --- /dev/null +++ b/example/rnn/char_lstm.ipynb @@ -0,0 +1,559 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Char LSTM Example.\n", + "This example aims to show how to use lstm to build a char level language model, and generate text from it. \n", + "We use a tiny shakespeare text for demo purpose. \n", + "\n", + "Data can be found at [https://github.com/dmlc/web-data/tree/master/mxnet/tinyshakespeare](https://github.com/dmlc/web-data/tree/master/mxnet/tinyshakespeare). " + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "from collections import Counter\n", + "from collections import defaultdict\n", + "import mxnet as mx\n", + "import numpy as np\n", + "import sys\n", + "import lstm" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Set basic network parameters. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "batch_size = 32\n", + "seq_len = 32\n", + "num_hidden = 256\n", + "num_embed = 256\n", + "num_lstm_layer = 2\n", + "num_round = 21\n", + "learning_rate= 1\n", + "wd=0.00001\n", + "momentum=0.0\n", + "clip_gradient=1\n", + "update_period = 1\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Make dictionary from text" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def make_dict(text, max_vocab=10000):\n", + " lst = list(text)\n", + " cnt = Counter(lst)\n", + " print(\"Total unique char: %d\" % len(cnt))\n", + " common = cnt.most_common(max_vocab - 1)\n", + " dic = defaultdict(int)\n", + " idx = 0\n", + " for c, _ in common:\n", + " dic[c] = idx\n", + " idx += 1\n", + " if len(dic) == max_vocab - 1:\n", + " dic[\"_UNKNOWN_\"] = idx\n", + " return dic\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Transfer text into data batch" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "def make_batch(file_path, batch_size=32, seq_lenth=32, max_vocab=10000, dic=None):\n", + " fi = open(file_path)\n", + " text = fi.read()\n", + " fi.close()\n", + " if dic == None:\n", + " dic = make_dict(text, max_vocab)\n", + " lookup_table = dict((idx, c) for c, idx in dic.items())\n", + " char_lst = list(text)\n", + " num_batch = int(len(char_lst) / batch_size)\n", + " char_lst = char_lst[:num_batch * batch_size]\n", + " data = np.zeros((num_batch, batch_size), dtype=\"float32\")\n", + " idx = 0\n", + " for j in range(batch_size):\n", + " for i in range(num_batch):\n", + " if char_lst[idx] in dic:\n", + " data[i][j] = dic[char_lst[idx]]\n", + " else:\n", + " char_lst[idx] = dic[\"_UNKNOWN_\"]\n", + " idx += 1\n", + " return data, dic, lookup_table\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Total unique char: 65\n" + ] + } + ], + "source": [ + "X, dic, lookup_table = make_batch(\"./input.txt\", batch_size=batch_size, seq_lenth=seq_len)\n", + "vocab = len(dic)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Move tail text" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "def drop_tail(X, seq_len):\n", + " shape = X.shape\n", + " nstep = int(shape[0] / seq_len)\n", + " return X[0:(nstep * seq_len), :]\n", + "\n", + "train_val_fraction = 0.9\n", + "size = X.shape[0]\n", + "X_train = X[:int(size * train_val_fraction), :]\n", + "X_val = X[int(size * train_val_fraction):, :]\n", + "\n", + "X_train = drop_tail(X_train, seq_len)\n", + "X_val = drop_tail(X_val, seq_len)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Set up LSTM model on GPU" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "model = lstm.setup_rnn_model(mx.gpu(),\n", + " num_lstm_layer=num_lstm_layer,\n", + " seq_len=seq_len,\n", + " num_hidden=num_hidden,\n", + " num_embed=num_embed,\n", + " num_label=vocab,\n", + " batch_size=batch_size,\n", + " input_size=vocab,\n", + " initializer=mx.initializer.Uniform(0.1),\n", + " 
dropout=0.)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Train LSTM model" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Training swith train.shape=(31360, 32)\n", + "Training swith val.shape=(3456, 32)\n", + "batch_size=32\n", + "seq_len=32\n", + "Epoch [125] Train: NLL=3.368, Prep=29.019\n", + "Epoch [250] Train: NLL=3.289, Prep=26.811\n", + "Epoch [375] Train: NLL=3.180, Prep=24.044\n", + "Epoch [500] Train: NLL=3.070, Prep=21.534\n", + "Epoch [625] Train: NLL=2.971, Prep=19.503\n", + "Epoch [750] Train: NLL=2.891, Prep=18.011\n", + "Epoch [875] Train: NLL=2.824, Prep=16.846\n", + "Iter [0] Train: Time: 40.182 sec, NLL=2.775, Prep=16.041\n", + "Iter [0] Val: NLL=2.288, Prep=9.857\n", + "Epoch [1000] Train: NLL=2.347, Prep=10.451\n", + "Epoch [1125] Train: NLL=2.321, Prep=10.188\n", + "Epoch [1250] Train: NLL=2.298, Prep=9.951\n", + "Epoch [1375] Train: NLL=2.276, Prep=9.741\n", + "Epoch [1500] Train: NLL=2.256, Prep=9.541\n", + "Epoch [1625] Train: NLL=2.234, Prep=9.338\n", + "Epoch [1750] Train: NLL=2.215, Prep=9.160\n", + "Epoch [1875] Train: NLL=2.196, Prep=8.987\n", + "Iter [1] Train: Time: 40.342 sec, NLL=2.184, Prep=8.885\n", + "Iter [1] Val: NLL=1.988, Prep=7.298\n", + "Epoch [2000] Train: NLL=2.050, Prep=7.766\n", + "Epoch [2125] Train: NLL=2.032, Prep=7.631\n", + "Epoch [2250] Train: NLL=2.014, Prep=7.490\n", + "Epoch [2375] Train: NLL=2.002, Prep=7.405\n", + "Epoch [2500] Train: NLL=1.988, Prep=7.297\n", + "Epoch [2625] Train: NLL=1.974, Prep=7.200\n", + "Epoch [2750] Train: NLL=1.961, Prep=7.106\n", + "Epoch [2875] Train: NLL=1.949, Prep=7.024\n", + "Iter [2] Train: Time: 40.377 sec, NLL=1.943, Prep=6.981\n", + "Iter [2] Val: NLL=1.808, Prep=6.101\n", + "Reset learning rate to 0.9\n", + "Epoch [3000] Train: NLL=1.850, Prep=6.359\n", + "Epoch [3125] Train: NLL=1.844, Prep=6.323\n", + "Epoch [3250] Train: NLL=1.831, Prep=6.238\n", + "Epoch [3375] Train: NLL=1.822, Prep=6.185\n", + "Epoch [3500] Train: NLL=1.812, Prep=6.124\n", + "Epoch [3625] Train: NLL=1.805, Prep=6.077\n", + "Epoch [3750] Train: NLL=1.797, Prep=6.033\n", + "Epoch [3875] Train: NLL=1.790, Prep=5.990\n", + "Iter [3] Train: Time: 40.348 sec, NLL=1.787, Prep=5.973\n", + "Iter [3] Val: NLL=1.695, Prep=5.446\n", + "Epoch [4000] Train: NLL=1.736, Prep=5.676\n", + "Epoch [4125] Train: NLL=1.734, Prep=5.663\n", + "Epoch [4250] Train: NLL=1.722, Prep=5.595\n", + "Epoch [4375] Train: NLL=1.715, Prep=5.555\n", + "Epoch [4500] Train: NLL=1.707, Prep=5.514\n", + "Epoch [4625] Train: NLL=1.703, Prep=5.492\n", + "Epoch [4750] Train: NLL=1.697, Prep=5.459\n", + "Epoch [4875] Train: NLL=1.693, Prep=5.434\n", + "Iter [4] Train: Time: 40.372 sec, NLL=1.691, Prep=5.427\n", + "Iter [4] Val: NLL=1.617, Prep=5.039\n", + "Epoch [5000] Train: NLL=1.659, Prep=5.257\n", + "Epoch [5125] Train: NLL=1.653, Prep=5.221\n", + "Epoch [5250] Train: NLL=1.645, Prep=5.179\n", + "Epoch [5375] Train: NLL=1.638, Prep=5.143\n", + "Epoch [5500] Train: NLL=1.633, Prep=5.119\n", + "Epoch [5625] Train: NLL=1.629, Prep=5.101\n", + "Epoch [5750] Train: NLL=1.625, Prep=5.079\n", + "Epoch [5875] Train: NLL=1.621, Prep=5.059\n", + "Iter [5] Train: Time: 40.363 sec, NLL=1.621, Prep=5.059\n", + "Iter [5] Val: NLL=1.569, Prep=4.804\n", + "Reset learning rate to 0.81\n", + "Epoch [6000] Train: NLL=1.603, Prep=4.966\n", + "Epoch [6125] Train: NLL=1.588, Prep=4.895\n", + "Epoch [6250] Train: 
NLL=1.585, Prep=4.879\n", + "Epoch [6375] Train: NLL=1.579, Prep=4.852\n", + "Epoch [6500] Train: NLL=1.574, Prep=4.827\n", + "Epoch [6625] Train: NLL=1.571, Prep=4.812\n", + "Epoch [6750] Train: NLL=1.567, Prep=4.793\n", + "Iter [6] Train: Time: 40.353 sec, NLL=1.565, Prep=4.781\n", + "Iter [6] Val: NLL=1.529, Prep=4.615\n", + "Epoch [6875] Train: NLL=1.574, Prep=4.824\n", + "Epoch [7000] Train: NLL=1.560, Prep=4.760\n", + "Epoch [7125] Train: NLL=1.545, Prep=4.686\n", + "Epoch [7250] Train: NLL=1.544, Prep=4.684\n", + "Epoch [7375] Train: NLL=1.538, Prep=4.654\n", + "Epoch [7500] Train: NLL=1.534, Prep=4.635\n", + "Epoch [7625] Train: NLL=1.530, Prep=4.620\n", + "Epoch [7750] Train: NLL=1.528, Prep=4.607\n", + "Iter [7] Train: Time: 40.353 sec, NLL=1.526, Prep=4.598\n", + "Iter [7] Val: NLL=1.496, Prep=4.463\n", + "Epoch [7875] Train: NLL=1.530, Prep=4.619\n", + "Epoch [8000] Train: NLL=1.522, Prep=4.579\n", + "Epoch [8125] Train: NLL=1.511, Prep=4.533\n", + "Epoch [8250] Train: NLL=1.511, Prep=4.532\n", + "Epoch [8375] Train: NLL=1.506, Prep=4.508\n", + "Epoch [8500] Train: NLL=1.503, Prep=4.494\n", + "Epoch [8625] Train: NLL=1.499, Prep=4.479\n", + "Epoch [8750] Train: NLL=1.497, Prep=4.467\n", + "Iter [8] Train: Time: 40.371 sec, NLL=1.495, Prep=4.461\n", + "Iter [8] Val: NLL=1.481, Prep=4.396\n", + "Reset learning rate to 0.729\n", + "Epoch [8875] Train: NLL=1.478, Prep=4.384\n", + "Epoch [9000] Train: NLL=1.489, Prep=4.434\n", + "Epoch [9125] Train: NLL=1.482, Prep=4.400\n", + "Epoch [9250] Train: NLL=1.480, Prep=4.391\n", + "Epoch [9375] Train: NLL=1.474, Prep=4.368\n", + "Epoch [9500] Train: NLL=1.471, Prep=4.355\n", + "Epoch [9625] Train: NLL=1.469, Prep=4.343\n", + "Epoch [9750] Train: NLL=1.466, Prep=4.333\n", + "Iter [9] Train: Time: 40.344 sec, NLL=1.465, Prep=4.329\n", + "Iter [9] Val: NLL=1.453, Prep=4.278\n", + "Epoch [9875] Train: NLL=1.458, Prep=4.297\n", + "Epoch [10000] Train: NLL=1.466, Prep=4.331\n", + "Epoch [10125] Train: NLL=1.460, Prep=4.305\n", + "Epoch [10250] Train: NLL=1.456, Prep=4.289\n", + "Epoch [10375] Train: NLL=1.452, Prep=4.270\n", + "Epoch [10500] Train: NLL=1.449, Prep=4.260\n", + "Epoch [10625] Train: NLL=1.447, Prep=4.248\n", + "Epoch [10750] Train: NLL=1.445, Prep=4.242\n", + "Iter [10] Train: Time: 40.341 sec, NLL=1.444, Prep=4.240\n", + "Iter [10] Val: NLL=1.438, Prep=4.211\n", + "Epoch [10875] Train: NLL=1.447, Prep=4.250\n", + "Epoch [11000] Train: NLL=1.445, Prep=4.243\n", + "Epoch [11125] Train: NLL=1.440, Prep=4.222\n", + "Epoch [11250] Train: NLL=1.436, Prep=4.205\n", + "Epoch [11375] Train: NLL=1.434, Prep=4.196\n", + "Epoch [11500] Train: NLL=1.432, Prep=4.185\n", + "Epoch [11625] Train: NLL=1.429, Prep=4.175\n", + "Epoch [11750] Train: NLL=1.428, Prep=4.169\n", + "Iter [11] Train: Time: 40.352 sec, NLL=1.427, Prep=4.168\n", + "Iter [11] Val: NLL=1.429, Prep=4.174\n", + "Reset learning rate to 0.6561\n", + "Epoch [11875] Train: NLL=1.431, Prep=4.182\n", + "Epoch [12000] Train: NLL=1.424, Prep=4.154\n", + "Epoch [12125] Train: NLL=1.422, Prep=4.145\n", + "Epoch [12250] Train: NLL=1.418, Prep=4.127\n", + "Epoch [12375] Train: NLL=1.414, Prep=4.113\n", + "Epoch [12500] Train: NLL=1.412, Prep=4.105\n", + "Epoch [12625] Train: NLL=1.410, Prep=4.096\n", + "Iter [12] Train: Time: 40.357 sec, NLL=1.409, Prep=4.091\n", + "Iter [12] Val: NLL=1.417, Prep=4.124\n", + "Epoch [12750] Train: NLL=1.435, Prep=4.201\n", + "Epoch [12875] Train: NLL=1.417, Prep=4.123\n", + "Epoch [13000] Train: NLL=1.408, Prep=4.086\n", + "Epoch [13125] Train: NLL=1.409, 
Prep=4.091\n", + "Epoch [13250] Train: NLL=1.404, Prep=4.073\n", + "Epoch [13375] Train: NLL=1.401, Prep=4.058\n", + "Epoch [13500] Train: NLL=1.398, Prep=4.048\n", + "Epoch [13625] Train: NLL=1.397, Prep=4.041\n", + "Iter [13] Train: Time: 40.356 sec, NLL=1.396, Prep=4.038\n", + "Iter [13] Val: NLL=1.411, Prep=4.102\n", + "Epoch [13750] Train: NLL=1.414, Prep=4.114\n", + "Epoch [13875] Train: NLL=1.402, Prep=4.063\n", + "Epoch [14000] Train: NLL=1.395, Prep=4.036\n", + "Epoch [14125] Train: NLL=1.396, Prep=4.037\n", + "Epoch [14250] Train: NLL=1.392, Prep=4.023\n", + "Epoch [14375] Train: NLL=1.389, Prep=4.010\n", + "Epoch [14500] Train: NLL=1.386, Prep=4.000\n", + "Epoch [14625] Train: NLL=1.385, Prep=3.995\n", + "Iter [14] Train: Time: 40.344 sec, NLL=1.384, Prep=3.992\n", + "Iter [14] Val: NLL=1.400, Prep=4.055\n", + "Reset learning rate to 0.59049\n", + "Epoch [14750] Train: NLL=1.378, Prep=3.966\n", + "Epoch [14875] Train: NLL=1.390, Prep=4.014\n", + "Epoch [15000] Train: NLL=1.383, Prep=3.986\n", + "Epoch [15125] Train: NLL=1.382, Prep=3.982\n", + "Epoch [15250] Train: NLL=1.377, Prep=3.965\n", + "Epoch [15375] Train: NLL=1.375, Prep=3.957\n", + "Epoch [15500] Train: NLL=1.372, Prep=3.945\n", + "Epoch [15625] Train: NLL=1.371, Prep=3.938\n", + "Iter [15] Train: Time: 40.352 sec, NLL=1.370, Prep=3.936\n", + "Iter [15] Val: NLL=1.393, Prep=4.026\n", + "Epoch [15750] Train: NLL=1.368, Prep=3.927\n", + "Epoch [15875] Train: NLL=1.380, Prep=3.974\n", + "Epoch [16000] Train: NLL=1.374, Prep=3.951\n", + "Epoch [16125] Train: NLL=1.371, Prep=3.940\n", + "Epoch [16250] Train: NLL=1.367, Prep=3.922\n", + "Epoch [16375] Train: NLL=1.364, Prep=3.912\n", + "Epoch [16500] Train: NLL=1.362, Prep=3.905\n", + "Epoch [16625] Train: NLL=1.361, Prep=3.900\n", + "Iter [16] Train: Time: 40.358 sec, NLL=1.360, Prep=3.898\n", + "Iter [16] Val: NLL=1.389, Prep=4.012\n", + "Epoch [16750] Train: NLL=1.367, Prep=3.924\n", + "Epoch [16875] Train: NLL=1.367, Prep=3.923\n", + "Epoch [17000] Train: NLL=1.363, Prep=3.907\n", + "Epoch [17125] Train: NLL=1.360, Prep=3.895\n", + "Epoch [17250] Train: NLL=1.357, Prep=3.886\n", + "Epoch [17375] Train: NLL=1.355, Prep=3.878\n", + "Epoch [17500] Train: NLL=1.353, Prep=3.867\n", + "Epoch [17625] Train: NLL=1.352, Prep=3.864\n", + "Iter [17] Train: Time: 40.347 sec, NLL=1.352, Prep=3.864\n", + "Iter [17] Val: NLL=1.384, Prep=3.990\n", + "Reset learning rate to 0.531441\n", + "Epoch [17750] Train: NLL=1.362, Prep=3.903\n", + "Epoch [17875] Train: NLL=1.355, Prep=3.877\n", + "Epoch [18000] Train: NLL=1.353, Prep=3.870\n", + "Epoch [18125] Train: NLL=1.348, Prep=3.851\n", + "Epoch [18250] Train: NLL=1.346, Prep=3.843\n", + "Epoch [18375] Train: NLL=1.344, Prep=3.834\n", + "Epoch [18500] Train: NLL=1.342, Prep=3.827\n", + "Iter [18] Train: Time: 40.354 sec, NLL=1.341, Prep=3.823\n", + "Iter [18] Val: NLL=1.378, Prep=3.967\n", + "Epoch [18625] Train: NLL=1.370, Prep=3.935\n", + "Epoch [18750] Train: NLL=1.352, Prep=3.863\n", + "Epoch [18875] Train: NLL=1.345, Prep=3.838\n", + "Epoch [19000] Train: NLL=1.346, Prep=3.841\n", + "Epoch [19125] Train: NLL=1.341, Prep=3.823\n", + "Epoch [19250] Train: NLL=1.338, Prep=3.811\n", + "Epoch [19375] Train: NLL=1.336, Prep=3.803\n", + "Epoch [19500] Train: NLL=1.334, Prep=3.797\n", + "Iter [19] Train: Time: 40.370 sec, NLL=1.334, Prep=3.795\n", + "Iter [19] Val: NLL=1.377, Prep=3.961\n", + "Epoch [19625] Train: NLL=1.354, Prep=3.874\n", + "Epoch [19750] Train: NLL=1.344, Prep=3.836\n", + "Epoch [19875] Train: NLL=1.338, Prep=3.811\n", + 
"Epoch [20000] Train: NLL=1.338, Prep=3.813\n", + "Epoch [20125] Train: NLL=1.334, Prep=3.797\n", + "Epoch [20250] Train: NLL=1.331, Prep=3.786\n", + "Epoch [20375] Train: NLL=1.329, Prep=3.778\n", + "Epoch [20500] Train: NLL=1.328, Prep=3.774\n", + "Iter [20] Train: Time: 40.363 sec, NLL=1.327, Prep=3.771\n", + "Iter [20] Val: NLL=1.373, Prep=3.946\n", + "Reset learning rate to 0.478297\n" + ] + } + ], + "source": [ + "lstm.train_lstm(model, X_train, X_val,\n", + " num_round=num_round,\n", + " half_life=3,\n", + " update_period=update_period,\n", + " learning_rate=learning_rate,\n", + " wd=wd,\n", + " momentum=momentum,\n", + " clip_gradient=clip_gradient)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Get parameter from model" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "args = dict([(name, arr) for i, arr, grad_arr, name in model.param_blocks])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Make a sampler use the parameter we trained" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + ] + } + ], + "source": [ + "batch_size = 1\n", + "sampler = lstm.setup_rnn_sample_model(mx.cpu(), args, num_lstm_layer, num_hidden, num_embed, vocab, batch_size, vocab)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "start = 'a'\n", + "seq_len = 75\n", + "X_input_batch = np.zeros((1,1), dtype=\"float32\")\n", + "X_input_batch[0][0] = dic[start]\n", + "out = lstm.sample_lstm(sampler, X_input_batch, seq_len)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Lookup predictions" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "an'd and dear victories at sound before.\n", + "Sir! 
palient, made me; let it kiss \n" + ] + } + ], + "source": [ + "chars = [lookup_table[int(out[i][0])] for i in range(seq_len)]\n", + "print(start + \"\".join(chars))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.4.2" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/example/rnn/lstm.py b/example/rnn/lstm.py new file mode 100644 index 000000000000..25245aad18ee --- /dev/null +++ b/example/rnn/lstm.py @@ -0,0 +1,362 @@ +# pylint:skip-file +import sys +sys.path.insert(0, "../../python") +import mxnet as mx +import numpy as np +from collections import namedtuple +import time + +LSTMState = namedtuple("LSTMState", ["c", "h"]) +LSTMParam = namedtuple("LSTMParam", ["i2h_weight", "i2h_bias", + "h2h_weight", "h2h_bias"]) +LSTMModel = namedtuple("LSTMModel", ["rnn_exec", "symbol", + "init_states", "last_states", + "seq_data", "seq_labels", "seq_outputs", + "param_blocks"]) + +def lstm(num_hidden, indata, prev_state, param, seqidx, layeridx, dropout=0.): + """LSTM Cell symbol""" + if dropout > 0.: + indata = mx.sym.Dropout(data=indata, p=dropout) + i2h = mx.sym.FullyConnected(data=indata, + weight=param.i2h_weight, + bias=param.i2h_bias, + num_hidden=num_hidden * 4, + name="t%d_l%d_i2h" % (seqidx, layeridx)) + h2h = mx.sym.FullyConnected(data=prev_state.h, + weight=param.h2h_weight, + bias=param.h2h_bias, + num_hidden=num_hidden * 4, + name="t%d_l%d_h2h" % (seqidx, layeridx)) + gates = i2h + h2h + slice_gates = mx.sym.SliceChannel(gates, num_outputs=4, + name="t%d_l%d_slice" % (seqidx, layeridx)) + in_gate = mx.sym.Activation(slice_gates[0], act_type="sigmoid") + in_transform = mx.sym.Activation(slice_gates[1], act_type="tanh") + forget_gate = mx.sym.Activation(slice_gates[2], act_type="sigmoid") + out_gate = mx.sym.Activation(slice_gates[3], act_type="sigmoid") + next_c = (forget_gate * prev_state.c) + (in_gate * in_transform) + next_h = out_gate * mx.sym.Activation(next_c, act_type="tanh") + return LSTMState(c=next_c, h=next_h) + + +def lstm_unroll(num_lstm_layer, seq_len, + num_hidden, num_embed, num_label, dropout=0.): + """unrolled lstm network""" + # initialize the parameter symbols + embed_weight=mx.sym.Variable("embed_weight") + cls_weight = mx.sym.Variable("cls_weight") + cls_bias = mx.sym.Variable("cls_bias") + param_cells = [] + last_states = [] + for i in range(num_lstm_layer): + param_cells.append(LSTMParam(i2h_weight = mx.sym.Variable("l%d_i2h_weight" % i), + i2h_bias = mx.sym.Variable("l%d_i2h_bias" % i), + h2h_weight = mx.sym.Variable("l%d_h2h_weight" % i), + h2h_bias = mx.sym.Variable("l%d_h2h_bias" % i))) + state = LSTMState(c=mx.sym.Variable("l%d_init_c" % i), + h=mx.sym.Variable("l%d_init_h" % i)) + last_states.append(state) + assert(len(last_states) == num_lstm_layer) + + out_prob = [] + for seqidx in range(seq_len): + # embeding layer + data = mx.sym.Variable("t%d_data" % seqidx) + label = mx.sym.Variable("t%d_label" % seqidx) + hidden = mx.sym.FullyConnected(data=data, weight=embed_weight, + num_hidden = num_embed, no_bias=True, + name="t%d_embed" % seqidx) + # stack LSTM + for i in range(num_lstm_layer): + next_state = lstm(num_hidden, indata=hidden, + prev_state=last_states[i], + param=param_cells[i], + seqidx=seqidx, layeridx=i, dropout=0.) 
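+                # the layer's hidden output feeds the next layer at this time step; its full (c, h) state is carried to the next time step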
+ hidden = next_state.h + last_states[i] = next_state + # decoder + if dropout > 0.: + hidden = mx.sym.Dropout(data=hidden, p=dropout) + fc = mx.sym.FullyConnected(data=hidden, + weight=cls_weight, + bias=cls_bias, + num_hidden=num_label, + name="t%d_cls" % seqidx) + sm = mx.sym.Softmax(data=fc, label=label, name="t%d_sm" % seqidx) + out_prob.append(sm) + + for i in range(num_lstm_layer): + state = last_states[i] + state = LSTMState(c=mx.sym.BlockGrad(state.c, name="l%d_last_c" % i), + h=mx.sym.BlockGrad(state.h, name="l%d_last_h" % i)) + last_states[i] = state + + unpack_c = [state.c for state in last_states] + unpack_h = [state.h for state in last_states] + list_all = out_prob + unpack_c + unpack_h + return mx.sym.Group(list_all) + + +def is_param_name(name): + return name.endswith("weight") or name.endswith("bias") or\ + name.endswith("gamma") or name.endswith("beta") + + +def setup_rnn_model(ctx, + num_lstm_layer, seq_len, + num_hidden, num_embed, num_label, + batch_size, input_size, + initializer, dropout=0.): + """set up rnn model with lstm cells""" + rnn_sym = lstm_unroll(num_lstm_layer=num_lstm_layer, + num_hidden=num_hidden, + seq_len=seq_len, + num_embed=num_embed, + num_label=num_label, + dropout=dropout) + arg_names = rnn_sym.list_arguments() + + input_shapes = {} + for name in arg_names: + if name.endswith("init_c") or name.endswith("init_h"): + input_shapes[name] = (batch_size, num_hidden) + elif name.endswith("data"): + input_shapes[name] = (batch_size, input_size) + else: + pass + + arg_shape, out_shape, aux_shape = rnn_sym.infer_shape(**input_shapes) + arg_arrays = [mx.nd.zeros(s, ctx) for s in arg_shape] + args_grad = {} + for shape, name in zip(arg_shape, arg_names): + if is_param_name(name): + args_grad[name] = mx.nd.zeros(shape, ctx) + + rnn_exec = rnn_sym.bind(ctx=ctx, args=arg_arrays, + args_grad=args_grad, + grad_req="add") + param_blocks = [] + arg_dict = dict(zip(arg_names, rnn_exec.arg_arrays)) + for i, name in enumerate(arg_names): + if is_param_name(name): + initializer(name, arg_dict[name]) + + param_blocks.append((i, arg_dict[name], args_grad[name], name)) + else: + assert name not in args_grad + out_dict = dict(zip(rnn_sym.list_outputs(), rnn_exec.outputs)) + + init_states = [LSTMState(c=arg_dict["l%d_init_c" % i], + h=arg_dict["l%d_init_h" % i]) for i in range(num_lstm_layer)] + seq_labels = [rnn_exec.arg_dict["t%d_label" % i] for i in range(seq_len)] + seq_data = [rnn_exec.arg_dict["t%d_data" % i] for i in range(seq_len)] + last_states = [LSTMState(c=out_dict["l%d_last_c_output" % i], + h=out_dict["l%d_last_h_output" % i]) for i in range(num_lstm_layer)] + seq_outputs = [out_dict["t%d_sm_output" % i] for i in range(seq_len)] + + return LSTMModel(rnn_exec=rnn_exec, symbol=rnn_sym, + init_states=init_states, last_states=last_states, + seq_data=seq_data, seq_labels=seq_labels, seq_outputs=seq_outputs, + param_blocks=param_blocks) + + + +def set_rnn_inputs(m, X, begin): + seq_len = len(m.seq_data) + batch_size, vocab = m.seq_data[0].shape + for seqidx in range(seq_len): + idx = (begin + seqidx) % X.shape[0] + next_idx = (begin + seqidx + 1) % X.shape[0] + x = X[idx, :] + y = X[next_idx, :] + mx.nd.onehot_encode(mx.nd.array(x, ctx=m.seq_data[seqidx].context), + out=m.seq_data[seqidx]) + m.seq_labels[seqidx][:] = y + +def calc_nll(seq_label_probs, X, begin): + eps = 1e-10 + nll = 0. 
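+    # seq_label_probs[t] holds the probability assigned to the true next character for each sample; accumulate -log averaged over the batch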
+ for seqidx in range(len(seq_label_probs)): + next_idx = (begin + seqidx + 1) % X.shape[0] + y = X[next_idx, :] + py = seq_label_probs[seqidx].asnumpy() + nll += -np.sum(np.log(np.maximum(py, eps))) / len(y) + return nll + +def train_lstm(model, X_train_batch, X_val_batch, + num_round, update_period, + optimizer='sgd', half_life=2, **kwargs): + print("Training swith train.shape=%s" % str(X_train_batch.shape)) + print("Training swith val.shape=%s" % str(X_val_batch.shape)) + m = model + seq_len = len(m.seq_data) + batch_size = m.seq_data[0].shape[0] + print("batch_size=%d" % batch_size) + print("seq_len=%d" % seq_len) + rescale_grad = 1.0 / (seq_len * batch_size * update_period) + opt = mx.optimizer.create(optimizer, + rescale_grad=rescale_grad, + **kwargs) + updater = mx.optimizer.get_updater(opt) + epoch_counter = 0 + log_period = max(1000 / seq_len, 1) + + for iteration in range(num_round): + nbatch = 0 + train_nll = 0 + # reset states + for state in m.init_states: + state.c[:] = 0.0 + state.h[:] = 0.0 + tic = time.time() + assert X_train_batch.shape[0] % seq_len == 0 + assert X_val_batch.shape[0] % seq_len == 0 + for begin in range(0, X_train_batch.shape[0], seq_len): + set_rnn_inputs(m, X_train_batch, begin=begin) + m.rnn_exec.forward(is_train=True) + # probability of each label class, used to evaluate nll + seq_label_probs = [mx.nd.choose_element(out, label).copyto(mx.cpu()) + for out, label in zip(m.seq_outputs, m.seq_labels)] + m.rnn_exec.backward() + # transfer the states + for init, last in zip(m.init_states, m.last_states): + last.c.copyto(init.c) + last.h.copyto(init.h) + # update epoch counter + epoch_counter += 1 + if epoch_counter % update_period == 0: + # updare parameters + for idx, weight, grad, name in m.param_blocks: + updater(idx, grad, weight) + # reset gradient to zero + grad[:] = 0.0 + train_nll += calc_nll(seq_label_probs, X_train_batch, begin=begin) + + nbatch = begin + seq_len + if epoch_counter % log_period == 0: + print("Epoch [%d] Train: NLL=%.3f, Prep=%.3f" % ( + epoch_counter, train_nll / nbatch, np.exp(train_nll / nbatch))) + # end of training loop + toc = time.time() + print("Iter [%d] Train: Time: %.3f sec, NLL=%.3f, Prep=%.3f" % ( + iteration, toc - tic, train_nll / nbatch, np.exp(train_nll / nbatch))) + + val_nll = 0.0 + # validation set, reset states + for state in m.init_states: + state.c[:] = 0.0 + state.h[:] = 0.0 + for begin in range(0, X_val_batch.shape[0], seq_len): + set_rnn_inputs(m, X_val_batch, begin=begin) + m.rnn_exec.forward(is_train=False) + # probability of each label class, used to evaluate nll + seq_label_probs = [mx.nd.choose_element(out, label).copyto(mx.cpu()) + for out, label in zip(m.seq_outputs, m.seq_labels)] + # transfer the states + for init, last in zip(m.init_states, m.last_states): + last.c.copyto(init.c) + last.h.copyto(init.h) + val_nll += calc_nll(seq_label_probs, X_val_batch, begin=begin) + nbatch = X_val_batch.shape[0] + print("Iter [%d] Val: NLL=%.3f, Prep=%.3f" % ( + iteration, val_nll / nbatch, np.exp(val_nll / nbatch))) + if (iteration + 1) % half_life == 0: + opt.lr *= 0.9 + print("Reset learning rate to %g" % opt.lr) + +def setup_rnn_sample_model(ctx, + params, + num_lstm_layer, + num_hidden, num_embed, num_label, + batch_size, input_size): + seq_len = 1 + rnn_sym = lstm_unroll(num_lstm_layer=num_lstm_layer, + num_hidden=num_hidden, + seq_len=seq_len, + num_embed=num_embed, + num_label=num_label) + arg_names = rnn_sym.list_arguments() + input_shapes = {} + for name in arg_names: + if name.endswith("init_c") or 
name.endswith("init_h"): + input_shapes[name] = (batch_size, num_hidden) + elif name.endswith("data"): + input_shapes[name] = (batch_size, input_size) + else: + pass + arg_shape, out_shape, aux_shape = rnn_sym.infer_shape(**input_shapes) + arg_arrays = [mx.nd.zeros(s, ctx) for s in arg_shape] + arg_dict = dict(zip(arg_names, arg_arrays)) + for name, arr in params.items(): + arg_dict[name][:] = arr + rnn_exec = rnn_sym.bind(ctx=ctx, args=arg_arrays, args_grad=None, grad_req="null") + out_dict = dict(zip(rnn_sym.list_outputs(), rnn_exec.outputs)) + param_blocks = [] + params_array = list(params.items()) + for i in range(len(params)): + param_blocks.append((i, params_array[i][1], None, params_array[i][0])) + init_states = [LSTMState(c=arg_dict["l%d_init_c" % i], + h=arg_dict["l%d_init_h" % i]) for i in range(num_lstm_layer)] + seq_labels = [rnn_exec.arg_dict["t%d_label" % i] for i in range(seq_len)] + seq_data = [rnn_exec.arg_dict["t%d_data" % i] for i in range(seq_len)] + last_states = [LSTMState(c=out_dict["l%d_last_c_output" % i], + h=out_dict["l%d_last_h_output" % i]) for i in range(num_lstm_layer)] + seq_outputs = [out_dict["t%d_sm_output" % i] for i in range(seq_len)] + + return LSTMModel(rnn_exec=rnn_exec, symbol=rnn_sym, + init_states=init_states, last_states=last_states, + seq_data=seq_data, seq_labels=seq_labels, seq_outputs=seq_outputs, + param_blocks=param_blocks) + +# Python3 np.random.choice is too strict in eval float probability so we use an alternative +import random +import bisect +import collections + +def _cdf(weights): + total = sum(weights) + result = [] + cumsum = 0 + for w in weights: + cumsum += w + result.append(cumsum / total) + return result + +def _choice(population, weights): + assert len(population) == len(weights) + cdf_vals = _cdf(weights) + x = random.random() + idx = bisect.bisect(cdf_vals, x) + return population[idx] + +def sample_lstm(model, X_input_batch, seq_len, temperature=1., sample=True): + m = model + vocab = m.seq_outputs[0].shape[1] + batch_size = m.seq_data[0].shape[0] + outputs_ndarray = mx.nd.zeros(m.seq_outputs[0].shape) + outputs_batch = [] + tmp = [i for i in range(vocab)] + for i in range(seq_len): + outputs_batch.append(np.zeros(X_input_batch.shape)) + for i in range(seq_len): + set_rnn_inputs(m, X_input_batch, 0) + m.rnn_exec.forward(is_train=False) + outputs_ndarray[:] = m.seq_outputs[0] + for init, last in zip(m.init_states, m.last_states): + last.c.copyto(init.c) + last.h.copyto(init.h) + prob = np.clip(outputs_ndarray.asnumpy(), 1e-6, 1 - 1e-6) + if sample: + rescale = np.exp(np.log(prob) / temperature) + for j in range(batch_size): + p = rescale[j, :] + p[:] /= p.sum() + outputs_batch[i][j] = _choice(tmp, p) + # outputs_batch[i][j] = np.random.choice(vocab, 1, p) + else: + outputs_batch[i][:] = np.argmax(prob, axis=1) + X_input_batch[:] = outputs_batch[i] + return outputs_batch + + diff --git a/example/rnn/lstm_ptb.py b/example/rnn/lstm_ptb.py new file mode 100644 index 000000000000..b01a88aa5063 --- /dev/null +++ b/example/rnn/lstm_ptb.py @@ -0,0 +1,91 @@ +# pylint:skip-file +import lstm +import sys +sys.path.insert(0, "../../python") +import mxnet as mx +import numpy as np + +""" +PennTreeBank Language Model +We would like to thanks Wojciech Zaremba for his Torch LSTM code + +The data file can be found at: +https://github.com/dmlc/web-data/tree/master/mxnet/ptb +""" + +def load_data(path, dic=None): + fi = open(path) + content = fi.read() + content = content.replace('\n', '') + content = content.split(' ') + print("Loading %s, size 
of data = %d" % (path, len(content))) + x = np.zeros(len(content)) + if dic == None: + dic = {} + idx = 0 + for i in range(len(content)): + word = content[i] + if len(word) == 0: + continue + if not word in dic: + dic[word] = idx + idx += 1 + x[i] = dic[word] + print("Unique token: %d" % len(dic)) + return x, dic + +def drop_tail(X, seq_len): + shape = X.shape + nstep = int(shape[0] / seq_len) + return X[0:(nstep * seq_len), :] + + +def replicate_data(x, batch_size): + nbatch = int(x.shape[0] / batch_size) + x_cut = x[:nbatch * batch_size] + data = x_cut.reshape((nbatch, batch_size), order='F') + return data + +batch_size = 20 +seq_len = 20 +num_hidden = 200 +num_embed = 200 +num_lstm_layer = 2 +num_round = 20 +learning_rate= 1 +wd=0.00001 +momentum=0.0 +clip_gradient=1 +update_period = 1 + + +X_train, dic = load_data("./data/ptb.train.txt") +X_val, _ = load_data("./data/ptb.valid.txt", dic) +X_train_batch = replicate_data(X_train, batch_size) +X_val_batch = replicate_data(X_val, batch_size) +vocab = len(dic) +print("Vocab=%d" %vocab) + +X_train_batch = drop_tail(X_train_batch, seq_len) +X_val_batch = drop_tail(X_val_batch, seq_len) + + +model = lstm.setup_rnn_model(mx.gpu(), + num_lstm_layer=num_lstm_layer, + seq_len=seq_len, + num_hidden=num_hidden, + num_embed=num_embed, + num_label=vocab, + batch_size=batch_size, + input_size=vocab, + initializer=mx.initializer.Uniform(0.1)) + +lstm.train_lstm(model, X_train_batch, X_val_batch, + num_round=num_round, + half_life=2, + update_period=update_period, + learning_rate=learning_rate, + wd=wd, + momentum=momentum, + clip_gradient=clip_gradient) + diff --git a/include/mxnet/base.h b/include/mxnet/base.h index 1ef9c6bf8450..1eeffc1ab4b9 100644 --- a/include/mxnet/base.h +++ b/include/mxnet/base.h @@ -49,6 +49,19 @@ #endif #endif +/*! +* \brief define dllexport for Visual Studio +*/ +#ifdef _MSC_VER +#ifdef MXNET_EXPORTS +#define MXNET_API __declspec(dllexport) +#else +#define MXNET_API __declspec(dllimport) +#endif +#else +#define MXNET_API +#endif + /*! \brief namespace of mxnet */ namespace mxnet { /*! \brief mxnet cpu */ diff --git a/include/mxnet/c_api.h b/include/mxnet/c_api.h index b0cf64323afe..2bbda3ddbf0e 100644 --- a/include/mxnet/c_api.h +++ b/include/mxnet/c_api.h @@ -11,7 +11,7 @@ #endif /*! \brief MXNET_DLL prefix for windows" */ -#ifdef _MSC_VER +#ifdef _WIN32 #ifdef MXNET_EXPORTS #define MXNET_DLL MXNET_EXTERN_C __declspec(dllexport) #else @@ -46,6 +46,8 @@ typedef void *DataIterCreator; typedef void *DataIterHandle; /*! \brief handle to KVStore */ typedef void *KVStoreHandle; +/*! \brief handle to RecordIO */ +typedef void *RecordIOHandle; /*! 
* \brief return str message of the last error * all function in this file will return 0 when success @@ -883,4 +885,53 @@ MXNET_DLL int MXKVStoreSendCommmandToServers(KVStoreHandle handle, int cmd_id, const char* cmd_body); +/** + * \brief Create a RecordIO writer object + * \param uri path to file + * \param out handle pointer to the created object + * \return 0 when success, -1 when failure happens +*/ +MXNET_DLL int MXRecordIOWriterCreate(const char *uri, RecordIOHandle *out); + +/** + * \brief Delete a RecordIO writer object + * \param handle handle to RecordIO object + * \return 0 when success, -1 when failure happens +*/ +MXNET_DLL int MXRecordIOWriterFree(RecordIOHandle handle); + +/** + * \brief Write a record to a RecordIO object + * \param handle handle to RecordIO object + * \param buf buffer to write + * \param size size of buffer + * \return 0 when success, -1 when failure happens +*/ +MXNET_DLL int MXRecordIOWriterWriteRecord(RecordIOHandle *handle, + const char *buf, size_t size); + +/** + * \brief Create a RecordIO reader object + * \param uri path to file + * \param out handle pointer to the created object + * \return 0 when success, -1 when failure happens +*/ +MXNET_DLL int MXRecordIOReaderCreate(const char *uri, RecordIOHandle *out); + +/** + * \brief Delete a RecordIO reader object + * \param handle handle to RecordIO object + * \return 0 when success, -1 when failure happens +*/ +MXNET_DLL int MXRecordIOReaderFree(RecordIOHandle *handle); + +/** + * \brief Read a record from a RecordIO object + * \param handle handle to RecordIO object + * \param buf pointer to return buffer + * \param size pointer to the size of the buffer + * \return 0 when success, -1 when failure happens +*/ +MXNET_DLL int MXRecordIOReaderReadRecord(RecordIOHandle *handle, + char const **buf, size_t *size); #endif // MXNET_C_API_H_ diff --git a/include/mxnet/c_predict_api.h b/include/mxnet/c_predict_api.h new file mode 100644 index 000000000000..e5671da33cbc --- /dev/null +++ b/include/mxnet/c_predict_api.h @@ -0,0 +1,152 @@ +/*! + * Copyright (c) 2015 by Contributors + * \file c_predict_api.h + * \brief C predict API of mxnet, contains a minimal API to run prediction. + * This file is self-contained and does not depend on any other files. + */ +#ifndef MXNET_C_PREDICT_API_H_ +#define MXNET_C_PREDICT_API_H_ + +#ifdef __cplusplus +#define MXNET_EXTERN_C extern "C" +#endif + +#ifdef _WIN32 +#ifdef MXNET_EXPORTS +#define MXNET_DLL MXNET_EXTERN_C __declspec(dllexport) +#else +#define MXNET_DLL MXNET_EXTERN_C __declspec(dllimport) +#endif +#else +#define MXNET_DLL MXNET_EXTERN_C +#endif + +/*! \brief manually define unsigned int */ +typedef unsigned int mx_uint; +/*! \brief manually define float */ +typedef float mx_float; +/*! \brief handle to Predictor */ +typedef void *PredictorHandle; +/*! \brief handle to NDArray list */ +typedef void *NDListHandle; + +/*! + * \brief Get the last error that happened. + * \return The last error happened at the predictor. + */ +MXNET_DLL const char* MXGetLastError(); +/*! + * \brief create a predictor + * \param symbol_json_str The JSON string of the symbol. + * \param param_bytes The in-memory raw bytes of parameter ndarray file. + * \param param_size The size of parameter ndarray file. + * \param dev_type The device type, 1: cpu, 2:gpu + * \param dev_id The device id of the predictor. + * \param num_input_nodes Number of input nodes to the net, + * For feedforward net, this is 1. + * \param input_keys The name of input argument. 
+ * For feedforward net, this is {"data"} + * \param input_shape_indptr Index pointer of shapes of each input node. + * The length of this array = num_input_nodes + 1. + * For feedforward net that takes 4 dimensional input, this is {0, 4}. + * \param input_shape_data A flatted data of shapes of each input node. + * For feedforward net that takes 4 dimensional input, this is the shape data. + * \param out The created predictor handle. + * \return 0 when success, -1 when failure. + */ +MXNET_DLL int MXPredCreate(const char* symbol_json_str, + const char* param_bytes, + size_t param_size, + int dev_type, int dev_id, + mx_uint num_input_nodes, + const char** input_keys, + const mx_uint* input_shape_indptr, + const mx_uint* input_shape_data, + PredictorHandle* out); +/*! + * \brief Get the shape of output node. + * The returned shape_data and shape_ndim is only valid before next call to MXPred function. + * \param handle The handle of the predictor. + * \param index The index of output node, set to 0 if there is only one output. + * \param shape_data Used to hold pointer to the shape data + * \param shape_ndim Used to hold shape dimension. + * \return 0 when success, -1 when failure. + */ +MXNET_DLL int MXPredGetOutputShape(PredictorHandle handle, + mx_uint index, + mx_uint** shape_data, + mx_uint* shape_ndim); +/*! + * \brief Set the input data of predictor. + * \param handle The predictor handle. + * \param key The name of input node to set. + * For feedforward net, this is "data". + * \param data The pointer to the data to be set, with the shape specified in MXPredCreate. + * \param size The size of data array, used for safety check. + * \return 0 when success, -1 when failure. + */ +MXNET_DLL int MXPredSetInput(PredictorHandle handle, + const char* key, + const mx_float* data, + mx_uint size); +/*! + * \brief Run a forward pass to get the output + * \param handle The handle of the predictor. + * \return 0 when success, -1 when failure. + */ +MXNET_DLL int MXPredForward(PredictorHandle handle); +/*! + * \brief Get the output value of prediction. + * \param handle The handle of the predictor. + * \param index The index of output node, set to 0 if there is only one output. + * \param data User allocated data to hold the output. + * \param size The size of data array, used for safe checking. + * \return 0 when success, -1 when failure. + */ +MXNET_DLL int MXPredGetOutput(PredictorHandle handle, + mx_uint index, + mx_float* data, + mx_uint size); +/*! + * \brief Free a predictor handle. + * \param handle The handle of the predictor. + * \return 0 when success, -1 when failure. + */ +MXNET_DLL int MXPredFree(PredictorHandle handle); +/*! + * \brief Create a NDArray List by loading from ndarray file. + * This can be used to load mean image file. + * \param nd_file_bytes The byte contents of nd file to be loaded. + * \param nd_file_size The size of the nd file to be loaded. + * \param out The out put NDListHandle + * \param out_length Length of the list. + * \return 0 when success, -1 when failure. + */ +MXNET_DLL int MXNDListCreate(const char* nd_file_bytes, + size_t nd_file_size, + NDListHandle *out, + mx_uint* out_length); +/*! + * \brief Get an element from list + * \param handle The handle to the NDArray + * \param index The index in the list + * \param out_key The output key of the item + * \param out_data The data region of the item + * \param out_shape The shape of the item. + * \param out_ndim The number of dimension in the shape. + * \return 0 when success, -1 when failure. 
+ */ +MXNET_DLL int MXNDListGet(NDListHandle handle, + mx_uint index, + const char** out_key, + const mx_float** out_data, + const mx_uint** out_shape, + mx_uint* out_ndim); +/*! + * \brief Free a predictor handle. + * \param handle The handle of the predictor. + * \return 0 when success, -1 when failure. + */ +MXNET_DLL int MXNDListFree(NDListHandle handle); + +#endif // MXNET_C_PREDICT_API_H_ diff --git a/include/mxnet/engine.h b/include/mxnet/engine.h index 03eb45b54de0..195f5c05eb20 100644 --- a/include/mxnet/engine.h +++ b/include/mxnet/engine.h @@ -45,7 +45,7 @@ enum class FnProperty { /*! * \brief Dependency engine that schedules operations. */ -class Engine { +class MXNET_API Engine { public: /*! * \brief OnComplete Callback to the engine, diff --git a/include/mxnet/kvstore.h b/include/mxnet/kvstore.h index b84d1f62436f..6a3fb6a769f9 100644 --- a/include/mxnet/kvstore.h +++ b/include/mxnet/kvstore.h @@ -188,13 +188,7 @@ class KVStore { } /*! - * \return The number of nodes in this group. - * - * Always returns 1 when type == "local". Otherwise, returns - * - * - number of workers if if `IsWorkerNode() == true`, - * - number of servers if if `IsServerNode() == true`, - * - 1 if `IsSchedulerNode() == true`, + * \return The number of worker nodes */ virtual int get_group_size() const { return 1; @@ -205,8 +199,7 @@ class KVStore { * * But note that, this functions only blocks the main thread of workers until * all of them are reached this point. It doesn't guarantee that all - * operations issued before are actually finished, such as \ref Push and \ref - * Pull. In that case, we need to call \ref Wait or \ref WaitAll + * operations issued before are actually finished, such as \ref Push and \ref Pull. */ virtual void Barrier() { } diff --git a/include/mxnet/ndarray.h b/include/mxnet/ndarray.h index fce3d61f6855..ec8c856d84c1 100644 --- a/include/mxnet/ndarray.h +++ b/include/mxnet/ndarray.h @@ -243,21 +243,21 @@ class NDArray { ptr_->CheckAndAlloc(); } /*! - * \brief Save list of narray into the file. - * \param fname name of the file. + * \brief Save list of narray into the Stream.x + * \param fo The stream of output. * \param data the NDArrays to be saved. * \param names the name of the NDArray, optional, can be zero length. */ - static void Save(const std::string& fname, + static void Save(dmlc::Stream* fo, const std::vector& data, const std::vector& names); /*! - * \brief Load list of narray into from the file. - * \param fname name of the file. + * \brief Load list of narray into from the stream. + * \param fi The stream of the input file. * \param data the NDArrays to be loaded * \param keys the name of the NDArray, if saved in the file. */ - static void Load(const std::string& fname, + static void Load(dmlc::Stream* fi, std::vector* data, std::vector* keys); diff --git a/include/mxnet/operator.h b/include/mxnet/operator.h index 72c5f6c28823..dc6176fe8b51 100644 --- a/include/mxnet/operator.h +++ b/include/mxnet/operator.h @@ -400,7 +400,7 @@ class OperatorProperty { }; /*! \brief typedef the factory function of operator property */ -typedef OperatorProperty *(*OperatorPropertyFactory)(); +typedef std::function OperatorPropertyFactory; /*! * \brief Registry entry for OperatorProperty factory functions. 
*/ @@ -454,12 +454,8 @@ struct OperatorPropertyReg * \endcode */ #define MXNET_REGISTER_OP_PROPERTY(name, OperatorPropertyType) \ - static ::mxnet::OperatorProperty* __create__ ## OperatorProperty ## name ## __() { \ - OperatorProperty* ret = new OperatorPropertyType(); \ - return ret; \ - } \ DMLC_REGISTRY_REGISTER(::mxnet::OperatorPropertyReg, OperatorPropertyReg, name) \ - .set_body(__create__ ## OperatorProperty ## name ## __) \ + .set_body([]() { return new OperatorPropertyType(); }) \ .check_name() #endif // DMLC_USE_CXX11 diff --git a/include/mxnet/storage.h b/include/mxnet/storage.h index da7a8aaa5388..60bca03b0680 100644 --- a/include/mxnet/storage.h +++ b/include/mxnet/storage.h @@ -14,7 +14,7 @@ namespace mxnet { /*! * \brief Storage manager across multiple devices. */ -class Storage { +class MXNET_API Storage { public: /*! * \brief Storage handle. diff --git a/mshadow b/mshadow index d2c27549571f..27ba6a635e81 160000 --- a/mshadow +++ b/mshadow @@ -1 +1 @@ -Subproject commit d2c27549571fb6a71e81d8b860b1484809d8922f +Subproject commit 27ba6a635e81ac6e9f0f30a1ab1bf1d32e56f7d8 diff --git a/predict/python/mxnet_predict.py b/predict/python/mxnet_predict.py new file mode 100644 index 000000000000..4abeefcb77d8 --- /dev/null +++ b/predict/python/mxnet_predict.py @@ -0,0 +1,211 @@ +# coding: utf-8 +# pylint: disable=invalid-name, too-many-arguments +"""Lightweight API for mxnet prediction. + +This is for prediction only, use mxnet python package instead for most tasks. +""" +from __future__ import absolute_import + +import os +import sys +import ctypes +import numpy as np + +__all__ = ["Predictor", "load_ndarray_file"] + +if sys.version_info[0] == 3: + py_str = lambda x: x.decode('utf-8') +else: + py_str = lambda x: x + +def c_str(string): + """"Convert a python string to C string.""" + return ctypes.c_char_p(string.encode('utf-8')) + +def c_array(ctype, values): + """Create ctypes array from a python array.""" + return (ctype * len(values))(*values) + +def _find_lib_path(): + """Find mxnet library.""" + curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) + api_path = os.path.join(curr_path, '../../lib/') + dll_path = [curr_path, api_path] + dll_path = [os.path.join(p, 'libmxnet.so') for p in dll_path] + \ + [os.path.join(p, 'libmxnet_predict.so') for p in dll_path] + lib_path = [p for p in dll_path if os.path.exists(p) and os.path.isfile(p)] + if len(lib_path) == 0: + raise RuntimeError('Cannot find the files.\n' + + 'List of candidates:\n' + str('\n'.join(dll_path))) + return lib_path + + +def _load_lib(): + """Load libary by searching possible path.""" + lib_path = _find_lib_path() + lib = ctypes.cdll.LoadLibrary(lib_path[0]) + # DMatrix functions + lib.MXGetLastError.restype = ctypes.c_char_p + return lib + + +def _check_call(ret): + """Check the return value of API.""" + if ret != 0: + raise RuntimeError(py_str(_LIB.MXGetLastError())) + +_LIB = _load_lib() +# type definitions +mx_uint = ctypes.c_uint +mx_float = ctypes.c_float +mx_float_p = ctypes.POINTER(mx_float) +PredictorHandle = ctypes.c_void_p +NDListHandle = ctypes.c_void_p + +devstr2type = {'cpu': 1, 'gpu': 2, 'cpu_pinned': 3} + +class Predictor(object): + """A predictor class that runs prediction. + + Parameters + ---------- + symbol_json_str : str + Path to the symbol file. + + param_raw_bytes : str, bytes + The raw parameter bytes. + + input_shapes : dict of str to tuple + The shape of input data + + dev_type : str, optional + The device type of the predictor. 
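+        Valid values are 'cpu', 'gpu' and 'cpu_pinned' (see devstr2type above).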
+ + dev_id : int, optional + The device id of the predictor. + """ + def __init__(self, symbol_file, + param_raw_bytes, input_shapes, + dev_type="cpu", dev_id=0): + dev_type = devstr2type[dev_type] + indptr = [0] + sdata = [] + keys = [] + for k, v in input_shapes.items(): + if not isinstance(v, tuple): + raise ValueError("Expect input_shapes to be dict str->tuple") + keys.append(c_str(k)) + sdata.extend(v) + indptr.append(len(sdata)) + handle = PredictorHandle() + param_raw_bytes = bytearray(param_raw_bytes) + ptr = (ctypes.c_char * len(param_raw_bytes)).from_buffer(param_raw_bytes) + _check_call(_LIB.MXPredCreate( + c_str(symbol_file), + ptr, len(param_raw_bytes), + ctypes.c_int(dev_type), ctypes.c_int(dev_id), + mx_uint(len(indptr) - 1), + c_array(ctypes.c_char_p, keys), + c_array(mx_uint, indptr), + c_array(mx_uint, sdata), + ctypes.byref(handle))) + self.handle = handle + + def __del__(self): + _check_call(_LIB.MXPredFree(self.handle)) + + def forward(self, **kwargs): + """Perform forward to get the output. + + Parameters + ---------- + **kwargs + Keyword arguments of input variable name to data. + + Examples + -------- + >>> predictor.forward(data=mydata) + >>> out = predictor.get_output(0) + """ + for k, v in kwargs.items(): + if not isinstance(v, np.ndarray): + raise ValueError("Expect numpy ndarray as input") + v = np.ascontiguousarray(v, dtype=np.float32) + _check_call(_LIB.MXPredSetInput( + self.handle, c_str(k), + v.ctypes.data_as(mx_float_p), + mx_uint(v.size))) + _check_call(_LIB.MXPredForward(self.handle)) + + def get_output(self, index): + """Get the index-th output. + + Parameters + ---------- + index : int + The index of output. + + Returns + ------- + out : numpy array. + The output array. + """ + pdata = ctypes.POINTER(mx_uint)() + ndim = mx_uint() + _check_call(_LIB.MXPredGetOutputShape( + self.handle, index, + ctypes.byref(pdata), + ctypes.byref(ndim))) + shape = tuple(pdata[:ndim.value]) + data = np.empty(shape, dtype=np.float32) + _check_call(_LIB.MXPredGetOutput( + self.handle, mx_uint(index), + data.ctypes.data_as(mx_float_p), + mx_uint(data.size))) + return data + + +def load_ndarray_file(nd_bytes): + """Load ndarray file and return as list of numpy array. + + Parameters + ---------- + nd_bytes : str or bytes + The internal ndarray bytes + + Returns + ------- + out : dict of str to numpy array or list of numpy array + The output list or dict, depending on whether the saved type is list or dict. 
+ """ + handle = NDListHandle() + olen = mx_uint() + nd_bytes = bytearray(nd_bytes) + ptr = (ctypes.c_char * len(nd_bytes)).from_buffer(nd_bytes) + _check_call(_LIB.MXNDListCreate( + ptr, len(nd_bytes), + ctypes.byref(handle), ctypes.byref(olen))) + keys = [] + arrs = [] + + for i in range(olen.value): + key = ctypes.c_char_p() + cptr = mx_float_p() + pdata = ctypes.POINTER(mx_uint)() + ndim = mx_uint() + _check_call(_LIB.MXNDListGet( + handle, mx_uint(i), ctypes.byref(key), + ctypes.byref(cptr), ctypes.byref(pdata), ctypes.byref(ndim))) + shape = tuple(pdata[:ndim.value]) + dbuffer = (mx_float * np.prod(shape)).from_address(ctypes.addressof(cptr.contents)) + ret = np.frombuffer(dbuffer, dtype=np.float32).reshape(shape) + ret = np.array(ret, dtype=np.float32) + keys.append(py_str(key.value)) + arrs.append(ret) + _check_call(_LIB.MXNDListFree(handle)) + + if len(keys) == 0 or len(keys[0]) == 0: + return arrs + else: + return {keys[i] : arrs[i] for i in range(len(keys))} diff --git a/ps-lite b/ps-lite index 504faa73a826..0cc04093f7c9 160000 --- a/ps-lite +++ b/ps-lite @@ -1 +1 @@ -Subproject commit 504faa73a82638c4b2fe66f5696330da38637c96 +Subproject commit 0cc04093f7c9e07155f585552f31a90715bacef6 diff --git a/python/mxnet/__init__.py b/python/mxnet/__init__.py index 5b216a53596b..a036e003ba77 100644 --- a/python/mxnet/__init__.py +++ b/python/mxnet/__init__.py @@ -12,6 +12,7 @@ from . import symbol as sym from . import symbol from . import io +from . import recordio # use mx.nd as short for mx.ndarray from . import ndarray as nd # use mx.rnd as short for mx.random diff --git a/python/mxnet/base.py b/python/mxnet/base.py index 7b0ce9d5a9bd..d6aec6509b85 100644 --- a/python/mxnet/base.py +++ b/python/mxnet/base.py @@ -54,6 +54,7 @@ def _load_lib(): DataIterCreatorHandle = ctypes.c_void_p DataIterHandle = ctypes.c_void_p KVStoreHandle = ctypes.c_void_p +RecordIOHandle = ctypes.c_void_p #---------------------------- # helper function definition #---------------------------- diff --git a/python/mxnet/io.py b/python/mxnet/io.py index f6056d9c25e1..c9c96b57d0a9 100644 --- a/python/mxnet/io.py +++ b/python/mxnet/io.py @@ -156,10 +156,9 @@ def __init__(self, data, label, self.batch_data[i, 0:actual_size, ::] = data[loc:loc+actual_size, ::] self.batch_label[i, 0:actual_size] = label[loc:loc+actual_size] loc += batch_size - if data.shape[0] > batch_size: - self.num_pad = data.shape[0] % batch_size - else: - self.num_pad = batch_size - data.shape[0] + self.num_pad = batch_size - data.shape[0] % batch_size + if data.shape[0] % batch_size == 0: + self.num_pad = 0 self.out_data = None self.out_label = None self.current_batch = -1 @@ -259,7 +258,6 @@ def getpad(self): check_call(_LIB.MXDataIterGetPadNum(self.handle, ctypes.byref(pad))) return pad.value - def _make_io_iterator(handle): """Create an io iterator by handle.""" name = ctypes.c_char_p() diff --git a/python/mxnet/model.py b/python/mxnet/model.py index 6e6e01287539..e4cfbfbed35d 100644 --- a/python/mxnet/model.py +++ b/python/mxnet/model.py @@ -25,9 +25,9 @@ except ImportError: SKLEARN_INSTALLED = False -# Parameter to pass to epoch_end_callback -EpochEndParam = namedtuple('EpochEndParams', - ['iteration', +# Parameter to pass to batch_end_callback +BatchEndParam = namedtuple('BatchEndParams', + ['epoch', 'nbatch', 'eval_metric']) @@ -170,10 +170,10 @@ def _create_kvstore(kvstore, num_device, arg_params): def _train_multi_device(symbol, ctx, input_shape, arg_params, aux_params, - begin_round, end_round, optimizer, + begin_epoch, end_epoch, 
optimizer, kvstore, update_on_kvstore, train_data, eval_data=None, eval_metric=None, - iter_end_callback=None, epoch_end_callback=None, + epoch_end_callback=None, batch_end_callback=None, logger=None): """Internal training function on multiple devices. @@ -196,11 +196,11 @@ def _train_multi_device(symbol, ctx, input_shape, aux_params : dict of str to NDArray Model parameter, dict of name to NDArray of net's auxiliary states. - begin_round : int - The begining training iteration. + begin_epoch : int + The begining training epoch. - end_round : int - The end training iteration. + end_epoch : int + The end training epoch. optimizer : Optimizer The optimization algorithm @@ -214,11 +214,11 @@ def _train_multi_device(symbol, ctx, input_shape, eval_metric : EvalMetric A evaluation function. - iter_end_callback : callable(iteration, symbol, arg_params, aux_states) - A callback that is invoked at end of each iteration. - This can be used to checkpoint model each iteration. + epoch_end_callback : callable(epoch, symbol, arg_params, aux_states) + A callback that is invoked at end of each epoch. + This can be used to checkpoint model each epoch. - epoch_end_callback : callable(EpochEndParams) + batch_end_callback : callable(BatchEndParams) A callback that is invoked at end of each batch. This can be used to measure speed, get result from evaluation metric. etc. @@ -261,7 +261,7 @@ def _train_multi_device(symbol, ctx, input_shape, texec.copy_params_from(arg_params, aux_params) # init optmizer - optimizer.begin_round(begin_round) + optimizer.begin_epoch(begin_epoch) if not update_on_kvstore: updater = get_updater(optimizer) @@ -290,7 +290,7 @@ def _train_multi_device(symbol, ctx, input_shape, out_cpu_array = nd.zeros(merged_shape, cpu()) # Now start training - for iteration in range(begin_round, end_round): + for epoch in range(begin_epoch, end_epoch): # Training phase tic = time.time() eval_metric.reset() @@ -332,25 +332,25 @@ def _train_multi_device(symbol, ctx, input_shape, updater(index*num_device+k, g, w) nbatch += 1 - # epoch callback (for print purpose) - if epoch_end_callback != None: - epoch_end_params = EpochEndParam(iteration=iteration, + # batch callback (for print purpose) + if batch_end_callback != None: + batch_end_params = BatchEndParam(epoch=epoch, nbatch=nbatch, eval_metric=eval_metric) - if isinstance(epoch_end_callback, list): - for call in epoch_end_callback: - call(epoch_end_params) + if isinstance(batch_end_callback, list): + for call in batch_end_callback: + call(batch_end_params) else: - epoch_end_callback(epoch_end_params) + batch_end_callback(batch_end_params) # evaluate at end, so out_cpu_array can lazy copy eval_metric.update(label, out_cpu_array) - # reset training data after iteration finish + # reset training data after epoch finish train_data.reset() name, value = eval_metric.get() - logger.info('Iteration[%d] Train-%s=%f', iteration, name, value) + logger.info('Epoch[%d] Train-%s=%f', epoch, name, value) toc = time.time() - logger.info('Iteration[%d] Time cost=%.3f', iteration, (toc - tic)) + logger.info('Epoch[%d] Time cost=%.3f', epoch, (toc - tic)) # evaluation if eval_data: eval_metric.reset() @@ -367,9 +367,9 @@ def _train_multi_device(symbol, ctx, input_shape, eval_metric.update(label, out_cpu_array) eval_data.reset() name, value = eval_metric.get() - logger.info('Iteration[%d] Validation-%s=%f', iteration, name, value) + logger.info('Epoch[%d] Validation-%s=%f', epoch, name, value) - if iter_end_callback or iteration + 1 == end_round: + if epoch_end_callback or 
epoch + 1 == end_epoch: # copy data back to cpu for name, block in zip(arg_names, arg_blocks): if name in arg_params: @@ -379,17 +379,17 @@ def _train_multi_device(symbol, ctx, input_shape, if name in aux_params: weight = sum(w.copyto(cpu()) for w in block) / len(block) weight.copyto(aux_params[name]) - if iter_end_callback != None: - if isinstance(iter_end_callback, list): - for call in iter_end_callback: - call(iteration, symbol, arg_params, aux_params) + if epoch_end_callback != None: + if isinstance(epoch_end_callback, list): + for call in epoch_end_callback: + call(epoch, symbol, arg_params, aux_params) else: - iter_end_callback(iteration, symbol, arg_params, aux_params) - # end of all iterations + epoch_end_callback(epoch, symbol, arg_params, aux_params) + # end of all epochs return -def save_checkpoint(prefix, iteration, symbol, arg_params, aux_params): +def save_checkpoint(prefix, epoch, symbol, arg_params, aux_params): """Checkpoint the model data into file. Parameters @@ -397,8 +397,8 @@ def save_checkpoint(prefix, iteration, symbol, arg_params, aux_params): prefix : str Prefix of model name. - iteration : int - The iteration number of the model. + epoch : int + The epoch number of the model. symbol : Symbol The input symbol @@ -412,17 +412,17 @@ def save_checkpoint(prefix, iteration, symbol, arg_params, aux_params): Notes ----- - ``prefix-symbol.json`` will be saved for symbol. - - ``prefix-iteration.params`` will be saved for parameters. + - ``prefix-epoch.params`` will be saved for parameters. """ symbol.save('%s-symbol.json' % prefix) save_dict = {('arg:%s' % k) : v for k, v in arg_params.items()} save_dict.update({('aux:%s' % k) : v for k, v in aux_params.items()}) - param_name = '%s-%04d.params' % (prefix, iteration) + param_name = '%s-%04d.params' % (prefix, epoch) nd.save(param_name, save_dict) logging.info('Saved checkpoint to \"%s\"', param_name) -def load_checkpoint(prefix, iteration): +def load_checkpoint(prefix, epoch): """Load model checkpoint from file. Parameters @@ -430,8 +430,8 @@ def load_checkpoint(prefix, iteration): prefix : str Prefix of model name. - iteration : int - Iteration number of model we would like to load. + epoch : int + Epoch number of model we would like to load. Returns ------- @@ -447,10 +447,10 @@ def load_checkpoint(prefix, iteration): Notes ----- - ``prefix-symbol.json`` will be saved for symbol. - - ``prefix-iteration.params`` will be saved for parameters. + - ``prefix-epoch.params`` will be saved for parameters. """ symbol = sym.load('%s-symbol.json' % prefix) - save_dict = nd.load('%s-%04d.params' % (prefix, iteration)) + save_dict = nd.load('%s-%04d.params' % (prefix, epoch)) arg_params = {} aux_params = {} for k, v in save_dict.items(): @@ -476,8 +476,8 @@ class FeedForward(BASE_ESTIMATOR): The device context of training and prediction. To use multi GPU training, pass in a list of gpu contexts. - num_round : int, optional - Training parameter, number of training rounds(iterations). + num_epoch : int, optional + Training parameter, number of training epochs(epochs). optimizer : str or Optimizer, optional Training parameter, name or optimizer object for training. @@ -501,19 +501,19 @@ class FeedForward(BASE_ESTIMATOR): If this is True, no error will be thrown when aux_params and arg_params contain extra parameters than needed. - begin_round : int,optional - The begining training iteration. + begin_epoch : int,optional + The begining training epoch. **kwargs : dict The additional keyword arguments passed to optimizer. 
""" def __init__(self, symbol, ctx=None, - num_round=None, optimizer='sgd', + num_epoch=None, optimizer='sgd', initializer=Uniform(0.01), numpy_batch_size=128, arg_params=None, aux_params=None, allow_extra_params=False, - begin_round=0, + begin_epoch=0, **kwargs): # check if symbol contain duplicated names. _check_arguments(symbol) @@ -535,7 +535,7 @@ def __init__(self, symbol, ctx=None, ctx = [ctx] self.ctx = ctx # training parameters - self.num_round = num_round + self.num_epoch = num_epoch self.kwargs = kwargs.copy() self.optimizer = optimizer self.initializer = initializer @@ -546,7 +546,7 @@ def __init__(self, symbol, ctx=None, # internal helper state self._pred_exec = None self._pred_exec_input = None - self.begin_round = begin_round + self.begin_epoch = begin_epoch @staticmethod def _is_data_arg(name): @@ -608,8 +608,6 @@ def _init_iter(self, X, y, is_train): raise TypeError('y must be ndarray when X is numpy.ndarray') if X.shape[0] != y.shape[0]: raise ValueError("The numbers of data points and labels not equal") - if X.ndim != 2: - raise ValueError("Data must be 2D") if y.ndim == 2 and y.shape[1] == 1: y = y.flatten() if y.ndim != 1: @@ -669,7 +667,7 @@ def predict(self, X): return np.concatenate(outputs) def fit(self, X, y=None, eval_data=None, eval_metric='acc', - iter_end_callback=None, epoch_end_callback=None, + epoch_end_callback=None, batch_end_callback=None, kvstore='local', logger=None): """Fit the model. @@ -693,11 +691,11 @@ def fit(self, X, y=None, eval_data=None, eval_metric='acc', Or a customize evaluation function that returns the statistics based on minibatch. - iter_end_callback : callable(iteration, symbol, arg_params, aux_states) - A callback that is invoked at end of each iteration. - This can be used to checkpoint model each iteration. + epoch_end_callback : callable(epoch, symbol, arg_params, aux_states) + A callback that is invoked at end of each epoch. + This can be used to checkpoint model each epoch. - epoch_end_callback: callable(iteration) + batch_end_callback: callable(epoch) A callback that is invoked at end of each batch For print purpose @@ -718,7 +716,7 @@ def fit(self, X, y=None, eval_data=None, eval_metric='acc', X = self._init_iter(X, y, is_train=True) eval_data = self._init_eval_iter(eval_data) # Simply ignore the first example to get input_shape - # in first training round. + # in first training epoch. if not X.iter_next(): X.reset() assert X.iter_next() @@ -746,16 +744,16 @@ def fit(self, X, y=None, eval_data=None, eval_metric='acc', # do training _train_multi_device(self.symbol, self.ctx, input_shape, self.arg_params, self.aux_params, - begin_round=self.begin_round, end_round=self.num_round, + begin_epoch=self.begin_epoch, end_epoch=self.num_epoch, optimizer=optimizer, train_data=X, eval_data=eval_data, eval_metric=eval_metric, - iter_end_callback=iter_end_callback, epoch_end_callback=epoch_end_callback, + batch_end_callback=batch_end_callback, kvstore=kvstore, update_on_kvstore=update_on_kvstore, logger=logger) - def save(self, prefix, iteration=None): + def save(self, prefix, epoch=None): """Checkpoint the model checkpoint into file. You can also use pickle to do the job if you only work on python. @@ -775,15 +773,15 @@ def save(self, prefix, iteration=None): Notes ----- - ``prefix-symbol.json`` will be saved for symbol. - - ``prefix-iteration.params`` will be saved for parameters. + - ``prefix-epoch.params`` will be saved for parameters. 
""" - if iteration is None: - iteration = self.num_round - assert iteration is not None - save_checkpoint(prefix, iteration, self.symbol, self.arg_params, self.aux_params) + if epoch is None: + epoch = self.num_epoch + assert epoch is not None + save_checkpoint(prefix, epoch, self.symbol, self.arg_params, self.aux_params) @staticmethod - def load(prefix, iteration, ctx=None, **kwargs): + def load(prefix, epoch, ctx=None, **kwargs): """Load model checkpoint from file. Parameters @@ -791,13 +789,13 @@ def load(prefix, iteration, ctx=None, **kwargs): prefix : str Prefix of model name. - iteration : int - Iteration number of model we would like to load. + epoch : int + epoch number of model we would like to load. ctx : Context or list of Context, optional The device context of training and prediction. kwargs : dict - other parameters for model, including num_round, optimizer and numpy_batch_size + other parameters for model, including num_epoch, optimizer and numpy_batch_size Returns ------- @@ -807,18 +805,18 @@ def load(prefix, iteration, ctx=None, **kwargs): Notes ----- - ``prefix-symbol.json`` will be saved for symbol. - - ``prefix-iteration.params`` will be saved for parameters. + - ``prefix-epoch.params`` will be saved for parameters. """ - symbol, arg_params, aux_params = load_checkpoint(prefix, iteration) + symbol, arg_params, aux_params = load_checkpoint(prefix, epoch) return FeedForward(symbol, ctx=ctx, arg_params=arg_params, aux_params=aux_params, - begin_round=iteration, + begin_epoch=epoch, **kwargs) @staticmethod def create(symbol, X, y=None, ctx=None, - num_round=None, optimizer='sgd', initializer=Uniform(0.01), - eval_data=None, eval_metric='acc', iter_end_callback=None, + num_epoch=None, optimizer='sgd', initializer=Uniform(0.01), + eval_data=None, eval_metric='acc', epoch_end_callback=None, kvstore='local', logger=None, **kwargs): """Functional style to create a model. @@ -840,8 +838,8 @@ def create(symbol, X, y=None, ctx=None, The device context of training and prediction. To use multi GPU training, pass in a list of gpu contexts. - num_round : int, optional - Training parameter, number of training rounds(iterations). + num_epoch : int, optional + Training parameter, number of training epochs(epochs). optimizer : str or Optimizer, optional Training parameter, name or optimizer object for training. @@ -857,9 +855,9 @@ def create(symbol, X, y=None, ctx=None, Or a customize evaluation function that returns the statistics based on minibatch. - iter_end_callback : callable(iteration, symbol, arg_params, aux_states) - A callback that is invoked at end of each iteration. - This can be used to checkpoint model each iteration. + epoch_end_callback : callable(epoch, symbol, arg_params, aux_states) + A callback that is invoked at end of each epoch. + This can be used to checkpoint model each epoch. kvstore: KVStore or str, optional The KVStore or a string kvstore type: @@ -871,10 +869,10 @@ def create(symbol, X, y=None, ctx=None, In default uses 'local', often no need to change for single machiine. 
""" - model = FeedForward(symbol, ctx=ctx, num_round=num_round, + model = FeedForward(symbol, ctx=ctx, num_epoch=num_epoch, optimizer=optimizer, initializer=initializer, **kwargs) model.fit(X, y, eval_data=eval_data, eval_metric=eval_metric, - iter_end_callback=iter_end_callback, + epoch_end_callback=epoch_end_callback, kvstore=kvstore, logger=logger) return model diff --git a/python/mxnet/ndarray.py b/python/mxnet/ndarray.py index f08aeb3675d9..1e52f66cea81 100644 --- a/python/mxnet/ndarray.py +++ b/python/mxnet/ndarray.py @@ -305,6 +305,20 @@ def asnumpy(self): ctypes.c_size_t(data.size))) return data + def asscalar(self): + """Return a CPU scalar(float) of current ndarray. + + This ndarray must have shape (1,) + + Returns + ------- + scalar : np.float + The scalar representation of the ndarray. + """ + if self.shape != (1,): + raise ValueError("The current array is not a scalar") + return self.asnumpy()[0] + def copyto(self, other): """Copy the content of current array to other. @@ -335,6 +349,28 @@ def copyto(self, other): raise TypeError('copyto do not support type ' + str(type(other))) # pylint: enable= no-member + +def onehot_encode(indices, out): + """One hot encoding indices into matrix out. + + Parameters + ---------- + indices: NDArray + An NDArray containing indices of the categorical features. + + out: NDArray + The result holder of the encoding. + + Returns + ------- + out: Array + Same as out. + """ + # pylint: disable= no-member, protected-access + return NDArray._onehot_encode(indices, out, out=out) + # pylint: enable= no-member, protected-access + + def empty(shape, ctx=None): """Create an empty uninitialized new NDArray, with specified shape. diff --git a/python/mxnet/optimizer.py b/python/mxnet/optimizer.py index 8c5b54178f31..ccfb99eb6019 100644 --- a/python/mxnet/optimizer.py +++ b/python/mxnet/optimizer.py @@ -4,20 +4,75 @@ class Optimizer(object): """Base class of all optimizers.""" - def __init__(self): - self.iteration = 0 + opt_registry = {} + + @staticmethod + def register(klass): + """Register optimizers to the optimizer factory""" + assert(isinstance(klass, type)) + name = klass.__name__.lower() + if name in Optimizer.opt_registry: + print('WARNING: New optimizer %s.%s is overriding ' \ + 'existing optimizer %s.%s'%( + klass.__module__, klass.__name__, + Optimizer.opt_registry[name].__module__, + Optimizer.opt_registry[name].__name__)) + Optimizer.opt_registry[name] = klass + return klass + + @staticmethod + def create_optimizer(name, rescale_grad=1, **kwargs): + """Create an optimizer with specified name. - def begin_round(self, iteration): - """Function called to notify beginning of iteration. + Parameters + ---------- + name: str + Name of required optimizer. Should be the name + of a subclass of Optimizer. Case insensitive. + + rescale_grad : float + Rescaling factor on gradient. + + kwargs: dict + Parameters for optimizer + + Returns + ------- + opt : Optimizer + The result optimizer. + """ + if name.lower() in Optimizer.opt_registry: + return Optimizer.opt_registry[name.lower()]( + rescale_grad=rescale_grad, + **kwargs) + else: + raise ValueError('Cannot find optimizer %s' % name) + + def __init__(self, rescale_grad=1): + self.epoch = 0 + self.rescale_grad = rescale_grad + + def begin_epoch(self, epoch): + """Function called to notify beginning of epoch. Parameters ---------- - iteration : int - The iteration number. + epoch : int + The epoch number. 
""" - self.iteration = iteration + self.epoch = epoch + + def create_state(self, index, weight): + """Create additional optimizer state such as momentum. + override in implementations.""" + + def update(self, index, weight, grad, state): + """Update the parameters. override in implementations""" +#convenience wrapper for Optimizer.Register +register = Optimizer.register +@register class SGD(Optimizer): """A very simple SGD optimizer with momentum and weight regularization. @@ -41,11 +96,10 @@ class SGD(Optimizer): def __init__(self, learning_rate=0.01, momentum=0.0, wd=0.0001, rescale_grad=1, clip_gradient=None, lr_scheduler=None): - super(SGD, self).__init__() + super(SGD, self).__init__(rescale_grad) self.lr = learning_rate self.momentum = momentum self.wd = wd - self.rescale_grad = rescale_grad self.clip_gradient = clip_gradient self.lr_scheduler = lr_scheduler if lr_scheduler != None: @@ -87,29 +141,28 @@ def update(self, index, weight, grad, state): assert(isinstance(weight, NDArray)) assert(isinstance(grad, NDArray)) if self.lr_scheduler != None: - lr = self.lr_scheduler(self.iteration) + lr = self.lr_scheduler(self.epoch) else: lr = self.lr + + grad = grad * self.rescale_grad + if self.clip_gradient != None: + grad = clip(grad, -self.clip_gradient, self.clip_gradient) + if state: mom = state mom[:] *= self.momentum - if self.clip_gradient == None: - mom[:] += -lr * (grad * self.rescale_grad + self.wd * weight) - else: - mom[:] += -lr * (clip(grad * self.rescale_grad, -self.clip_gradient, - self.clip_gradient) + - self.wd * weight) + mom[:] += -lr * (grad + self.wd * weight) weight[:] += mom else: assert self.momentum == 0.0 - weight[:] += -lr * (grad * self.rescale_grad + self.wd * weight) - + weight[:] += -lr * (grad + self.wd * weight) -class Test(object): +@register +class Test(Optimizer): """For test use""" def __init__(self, rescale_grad=1): - self.rescale_grad = rescale_grad - + super(Test, self).__init__(rescale_grad) # pylint: disable=no-self-use def create_state(self, index, weight): @@ -121,31 +174,8 @@ def update(self, index, weight, grad, state): weight[:] += grad * self.rescale_grad state[:] = weight -def create(name, rescale_grad=1, **kwargs): - """Create an optimizer with specified name. - - Parameters - ---------- - name: str - Name of required optimizer - - rescale_grad : float - Rescaling factor on gradient. - - kwargs: dict - Parameters for optimizer - - Returns - ------- - opt : Optimizer - The result optimizer. - """ - if name == 'sgd' or name == 'SGD': - return SGD(rescale_grad=rescale_grad, **kwargs) - if name == 'test': - return Test(rescale_grad=rescale_grad) - else: - raise ValueError('Cannot find optimizer %s' % name) +#backward compatibility wrapper for Optimizer.CreateOptimizer +create = Optimizer.create_optimizer def get_updater(optimizer): """Return a clossure of the updater needed for kvstore diff --git a/python/mxnet/recordio.py b/python/mxnet/recordio.py new file mode 100644 index 000000000000..5346230f5101 --- /dev/null +++ b/python/mxnet/recordio.py @@ -0,0 +1,68 @@ +# coding: utf-8 +# pylint: disable=invalid-name, protected-access, fixme, too-many-arguments + +"""Python interface for DLMC RecrodIO data format""" +from __future__ import absolute_import + +import ctypes +from .base import _LIB +from .base import RecordIOHandle +from .base import check_call + +class MXRecordIO(object): + """Python interface for read/write RecordIO data formmat + + Parameters + ---------- + uri : string + uri path to recordIO file. 
+ flag : string + "r" for reading or "w" writing. + """ + def __init__(self, uri, flag): + uri = ctypes.c_char_p(uri) + self.handle = RecordIOHandle() + if flag == "w": + check_call(_LIB.MXRecordIOWriterCreate(uri, ctypes.byref(self.handle))) + self.writable = True + elif flag == "r": + check_call(_LIB.MXRecordIOReaderCreate(uri, ctypes.byref(self.handle))) + self.writable = False + else: + raise ValueError("Invalid flag %s"%flag) + + def __del__(self): + if self.writable: + check_call(_LIB.MXRecordIOWriterFree(self.handle)) + else: + check_call(_LIB.MXRecordIOReaderFree(self.handle)) + + def write(self, buf): + """Write a string buffer as a record + + Parameters + ---------- + buf : string + buffer to write. + """ + assert self.writable + check_call(_LIB.MXRecordIOWriterWriteRecord(self.handle, + ctypes.c_char_p(buf), + ctypes.c_size_t(len(buf)))) + + def read(self): + """Read a record as string + + Returns + ---------- + buf : string + buffer read. + """ + assert not self.writable + buf = ctypes.c_char_p() + size = ctypes.c_size_t() + check_call(_LIB.MXRecordIOReaderReadRecord(self.handle, + ctypes.byref(buf), + ctypes.byref(size))) + buf = ctypes.cast(buf, ctypes.POINTER(ctypes.c_char*size.value)) + return buf.contents.raw diff --git a/src/c_api.cc b/src/c_api/c_api.cc similarity index 91% rename from src/c_api.cc rename to src/c_api/c_api.cc index 5be4ad29a150..aaac3ee61d08 100644 --- a/src/c_api.cc +++ b/src/c_api/c_api.cc @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -20,26 +21,13 @@ #include #include #include - -// macro hanlding for threadlocal variables -#ifdef __GNUC__ - #define MX_TREAD_LOCAL __thread -#elif __STDC_VERSION__ >= 201112L - #define MX_TREAD_LOCAL _Thread_local -#elif defined(_MSC_VER) - #define MX_TREAD_LOCAL __declspec(thread) -#endif - -#ifndef MX_TREAD_LOCAL -#message("Warning: Threadlocal is not enabled"); -#endif +#include "./c_api_error.h" +#include "../common/thread_local.h" using namespace mxnet; /*! \brief entry to to easily hold returning information */ struct MXAPIThreadLocalEntry { - /*! \brief holds last error message */ - std::string last_error; /*! \brief result holder for returning string */ std::string ret_str; /*! \brief result holder for returning strings */ @@ -68,84 +56,8 @@ struct MXAPIThreadLocalEntry { } }; -/*! - * \brief A threadlocal store to store threadlocal variables. - * Will return a thread local singleton of type T - * \tparam T the type we like to store - */ -class MXAPIThreadLocalStore { - public: - /*! \brief store return entry */ - typedef MXAPIThreadLocalEntry T; - /*! \return get a thread local singleton */ - static T* Get() { - static MX_TREAD_LOCAL T* ptr = nullptr; - if (ptr == nullptr) { - ptr = new T(); - Singleton()->RegisterDelete(ptr); - } - return ptr; - } - - private: - /*! \brief constructor */ - MXAPIThreadLocalStore() {} - /*! \brief destructor */ - ~MXAPIThreadLocalStore() { - for (size_t i = 0; i < data_.size(); ++i) { - delete data_[i]; - } - } - /*! \return singleton of the store */ - static MXAPIThreadLocalStore *Singleton() { - static MXAPIThreadLocalStore inst; - return &inst; - } - /*! - * \brief register str for internal deletion - * \param str the string pointer - */ - void RegisterDelete(T *str) { - std::unique_lock lock(mutex_); - data_.push_back(str); - lock.unlock(); - } - /*! 
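A round-trip sketch for the new `MXRecordIO` wrapper; the file name `tmp.rec` is an assumption, and the caller is expected to know how many records were written since this first version exposes no explicit end-of-file signal:

```python
from mxnet.recordio import MXRecordIO

writer = MXRecordIO('tmp.rec', 'w')
for i in range(5):
    writer.write('record-%d' % i)
del writer                         # __del__ frees the writer handle, flushing the file

reader = MXRecordIO('tmp.rec', 'r')
for i in range(5):
    print(reader.read())           # returns the raw buffer exactly as written
del reader
```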
\brief internal mutex */ - std::mutex mutex_; - /*!\brief internal data */ - std::vector data_; -}; - -// NOTE: all functions return 0 upon success -// consider add try/catch block for user error -// handling in the future - -/*! \brief macro to guard beginning and end section of all functions */ -#define API_BEGIN() try { -/*! \brief every function starts with API_BEGIN(); - and finishes with API_END() or API_END_HANDLE_ERROR */ -#define API_END() } catch(dmlc::Error &_except_) { return MXHandleException(_except_); } return 0; -/*! - * \brief every function starts with API_BEGIN(); - * and finishes with API_END() or API_END_HANDLE_ERROR - * The finally clause contains procedure to cleanup states when an error happens. - */ -#define API_END_HANDLE_ERROR(Finalize) } catch(dmlc::Error &_except_) { Finalize; return MXHandleException(_except_); } return 0; // NOLINT(*) - -/*! \brief return str message of the last error */ -const char *MXGetLastError() { - return MXAPIThreadLocalStore::Get()->last_error.c_str(); -} - -/*! - * \brief handle exception throwed out - * \param e the exception - * \return the return value of API after exception is handled - */ -int MXHandleException(const dmlc::Error &e) { - MXAPIThreadLocalStore::Get()->last_error = e.what(); - return -1; -} +// define the threadlocal store. +typedef mxnet::common::ThreadLocalStore MXAPIThreadLocalStore; // Internal function to get the information // from function registry @@ -285,7 +197,10 @@ int MXNDArraySave(const char* fname, names[i] = keys[i]; } } - mxnet::NDArray::Save(fname, data, names); + { + std::unique_ptr fo(dmlc::Stream::Create(fname, "w")); + mxnet::NDArray::Save(fo.get(), data, names); + } API_END(); } @@ -299,7 +214,10 @@ int MXNDArrayLoad(const char* fname, API_BEGIN(); std::vector data; std::vector &names = ret->ret_vec_str; - mxnet::NDArray::Load(fname, &data, &names); + { + std::unique_ptr fi(dmlc::Stream::Create(fname, "r")); + mxnet::NDArray::Load(fi.get(), &data, &names); + } ret->ret_handles.resize(data.size()); for (size_t i = 0; i < data.size(); ++i) { NDArray *ptr = new NDArray(); @@ -1076,3 +994,75 @@ int MXKVStoreGetType(KVStoreHandle handle, *CHECK_NOTNULL(type) = static_cast(handle)->type().c_str(); API_END(); } + +struct MXRecordIOContext { + dmlc::RecordIOWriter *writer; + dmlc::RecordIOReader *reader; + dmlc::Stream *stream; + std::string *read_buff; +}; + +int MXRecordIOWriterCreate(const char *uri, + RecordIOHandle *out) { + API_BEGIN(); + dmlc::Stream *stream = dmlc::Stream::Create(uri, "w"); + MXRecordIOContext *context = new MXRecordIOContext; + context->writer = new dmlc::RecordIOWriter(stream); + context->reader = NULL; + context->stream = stream; + context->read_buff = NULL; + *out = reinterpret_cast(context); + API_END(); +} + +int MXRecordIOWriterFree(RecordIOHandle handle) { + API_BEGIN(); + MXRecordIOContext *context = + reinterpret_cast(handle); + delete context->writer; + delete context->stream; + API_END(); +} + +int MXRecordIOWriterWriteRecord(RecordIOHandle *handle, + const char *buf, size_t size) { + API_BEGIN(); + MXRecordIOContext *context = + reinterpret_cast(handle); + context->writer->WriteRecord(reinterpret_cast(buf), size); + API_END(); +} + +int MXRecordIOReaderCreate(const char *uri, + RecordIOHandle *out) { + API_BEGIN(); + dmlc::Stream *stream = dmlc::Stream::Create(uri, "r"); + MXRecordIOContext *context = new MXRecordIOContext; + context->reader = new dmlc::RecordIOReader(stream); + context->writer = NULL; + context->stream = stream; + context->read_buff = new 
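`MXNDArraySave`/`MXNDArrayLoad` now open an explicit `dmlc::Stream` instead of handing the file name down; the Python-facing behavior should be unchanged. A small sanity-check sketch using the usual Python entry points (file name assumed):

```python
import mxnet as mx

data = {'weight': mx.nd.ones((2, 3)), 'bias': mx.nd.zeros((3,))}
mx.nd.save('params.nd', data)      # ends up in MXNDArraySave
loaded = mx.nd.load('params.nd')   # ends up in MXNDArrayLoad; returns name -> NDArray
print(loaded['weight'].asnumpy())
```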
std::string(); + *out = reinterpret_cast(context); + API_END(); +} + +int MXRecordIOReaderFree(RecordIOHandle *handle) { + API_BEGIN(); + MXRecordIOContext *context = + reinterpret_cast(handle); + delete context->reader; + delete context->stream; + delete context->read_buff; + API_END(); +} + +int MXRecordIOReaderReadRecord(RecordIOHandle *handle, + char const **buf, size_t *size) { + API_BEGIN(); + MXRecordIOContext *context = + reinterpret_cast(handle); + context->reader->NextRecord(context->read_buff); + *buf = context->read_buff->c_str(); + *size = context->read_buff->size(); + API_END(); +} diff --git a/src/c_api/c_api_error.cc b/src/c_api/c_api_error.cc new file mode 100644 index 000000000000..2e9c74985d8f --- /dev/null +++ b/src/c_api/c_api_error.cc @@ -0,0 +1,21 @@ +/*! + * Copyright (c) 2015 by Contributors + * \file c_api_error.cc + * \brief C error handling + */ +#include "./c_api_error.h" +#include "../common/thread_local.h" + +struct ErrorEntry { + std::string last_error; +}; + +typedef mxnet::common::ThreadLocalStore MXAPIErrorStore; + +const char *MXGetLastError() { + return MXAPIErrorStore::Get()->last_error.c_str(); +} + +void MXAPISetLastError(const char* msg) { + MXAPIErrorStore::Get()->last_error = msg; +} diff --git a/src/c_api/c_api_error.h b/src/c_api/c_api_error.h new file mode 100644 index 000000000000..fe47052f704b --- /dev/null +++ b/src/c_api/c_api_error.h @@ -0,0 +1,39 @@ +/*! + * Copyright (c) 2015 by Contributors + * \file c_api_error.h + * \brief Error handling for C API. + */ +#ifndef MXNET_C_API_C_API_ERROR_H_ +#define MXNET_C_API_C_API_ERROR_H_ + +#include +#include +#include + +/*! \brief macro to guard beginning and end section of all functions */ +#define API_BEGIN() try { +/*! \brief every function starts with API_BEGIN(); + and finishes with API_END() or API_END_HANDLE_ERROR */ +#define API_END() } catch(dmlc::Error &_except_) { return MXAPIHandleException(_except_); } return 0; // NOLINT(*) +/*! + * \brief every function starts with API_BEGIN(); + * and finishes with API_END() or API_END_HANDLE_ERROR + * The finally clause contains procedure to cleanup states when an error happens. + */ +#define API_END_HANDLE_ERROR(Finalize) } catch(dmlc::Error &_except_) { Finalize; return MXAPIHandleException(_except_); } return 0; // NOLINT(*) + +/*! + * \brief Set the last error message needed by C API + * \param msg The error message to set. + */ +void MXAPISetLastError(const char* msg); +/*! + * \brief handle exception throwed out + * \param e the exception + * \return the return value of API after exception is handled + */ +inline int MXAPIHandleException(const dmlc::Error &e) { + MXAPISetLastError(e.what()); + return -1; +} +#endif // MXNET_C_API_C_API_ERROR_H_ diff --git a/src/c_api/c_predict_api.cc b/src/c_api/c_predict_api.cc new file mode 100644 index 000000000000..27f63d69944a --- /dev/null +++ b/src/c_api/c_predict_api.cc @@ -0,0 +1,237 @@ +/*! 
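The new `c_api_error.h`/`c_api_error.cc` pair centralizes the return-code convention: every C API call returns 0 on success and -1 on failure, and the message is retrievable through `MXGetLastError`. A minimal ctypes sketch of how a language binding can consume that contract (an illustration, not mxnet's actual binding code):

```python
import ctypes

def check_call(ret, lib):
    """Raise if a C API call signalled an error via its return code."""
    if ret != 0:
        lib.MXGetLastError.restype = ctypes.c_char_p
        raise RuntimeError(lib.MXGetLastError())
```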
+ * Copyright (c) 2015 by Contributors + * \file c_predict_api.cc + * \brief C predict API of mxnet + */ +#include +#include +#include +#include +#include +#include + +#include "./c_api_error.h" + +using namespace mxnet; + +// predictor interface +struct MXAPIPredictor { + // output arrays + std::vector out_arrays; + // argument arrays + std::vector arg_arrays; + // output shapes + std::vector out_shapes; + // key to arguments + std::unordered_map key2arg; + // executor + std::unique_ptr exec; +}; + +struct MXAPINDList { + std::vector keys; + std::vector shapes; + std::vector indptr; + std::vector data; +}; + +int MXPredCreate(const char* symbol_json_str, + const char* param_bytes, + size_t param_size, + int dev_type, int dev_id, + mx_uint num_input_nodes, + const char** input_keys, + const mx_uint* input_shape_indptr, + const mx_uint* input_shape_data, + PredictorHandle* out) { + MXAPIPredictor* ret = new MXAPIPredictor(); + API_BEGIN(); + Symbol sym; + // load in the symbol. + { + std::string json = symbol_json_str; + std::istringstream is(json); + dmlc::JSONReader reader(&is); + sym.Load(&reader); + } + // load the parameters + std::unordered_map arg_params, aux_params; + { + std::vector data; + std::vector names; + dmlc::MemoryFixedSizeStream fi((void*)param_bytes, param_size); // NOLINT(*) + NDArray::Load(&fi, &data, &names); + CHECK_EQ(names.size(), data.size()) + << "Invalid param file format"; + for (size_t i = 0; i < names.size(); ++i) { + if (!strncmp(names[i].c_str(), "aux:", 4)) { + aux_params[std::string(names[i].c_str() + 4)] = data[i]; + } + if (!strncmp(names[i].c_str(), "arg:", 4)) { + arg_params[std::string(names[i].c_str() + 4)] = data[i]; + } + } + } + + // shape inference and bind + std::unordered_map known_shape; + for (mx_uint i = 0; i < num_input_nodes; ++i) { + known_shape[std::string(input_keys[i])] = + TShape(input_shape_data + input_shape_indptr[i], + input_shape_data + input_shape_indptr[i + 1]); + } + std::vector arg_shapes; + std::vector arg_names = sym.ListArguments(); + std::vector aux_names = sym.ListAuxiliaryStates(); + std::vector out_shapes(sym.ListOutputs().size()); + std::vector aux_shapes(aux_names.size()); + for (size_t i = 0; i < arg_names.size(); ++i) { + std::string key = arg_names[i]; + ret->key2arg[key] = i; + if (known_shape.count(key) != 0) { + arg_shapes.push_back(known_shape[key]); + } else { + arg_shapes.push_back(TShape()); + } + } + CHECK(sym.InferShape(&arg_shapes, &out_shapes, &aux_shapes)) + << "The shape information of is not enough to get the shapes"; + ret->out_shapes = out_shapes; + Context ctx = Context::Create(static_cast(dev_type), dev_id); + + std::vector arg_arrays, aux_arrays; + for (size_t i = 0; i < arg_shapes.size(); ++i) { + NDArray nd = NDArray(arg_shapes[i], ctx); + if (arg_params.count(arg_names[i]) != 0) { + CopyFromTo(arg_params[arg_names[i]], &nd); + } + arg_arrays.push_back(nd); + } + for (size_t i = 0; i < aux_shapes.size(); ++i) { + NDArray nd = NDArray(aux_shapes[i], ctx); + if (aux_params.count(aux_names[i]) != 0) { + CopyFromTo(aux_params[aux_names[i]], &nd); + } + aux_arrays.push_back(nd); + } + ret->arg_arrays = arg_arrays; + // bind + { + std::vector grad_store(arg_arrays.size()); + std::vector grad_req(arg_arrays.size(), kNullOp); + ret->exec.reset(Executor::Bind(sym, ctx, arg_arrays, + grad_store, grad_req, + aux_arrays)); + ret->out_arrays = ret->exec->outputs(); + } + *out = ret; + API_END_HANDLE_ERROR(delete ret); +} + +int MXPredGetOutputShape(PredictorHandle handle, + mx_uint out_index, + mx_uint** 
shape_data, + mx_uint* shape_ndim) { + MXAPIPredictor* p = static_cast(handle); + API_BEGIN(); + CHECK_LT(out_index, p->out_arrays.size()) + << "Index exceed number of outputs"; + *shape_data = p->out_shapes[out_index].data(); + *shape_ndim = p->out_shapes[out_index].ndim(); + API_END(); +} + +int MXPredSetInput(PredictorHandle handle, + const char* key, + const mx_float* data, + mx_uint size) { + MXAPIPredictor* p = static_cast(handle); + API_BEGIN(); + auto it = p->key2arg.find(key); + if (it == p->key2arg.end()) { + LOG(FATAL) << "cannot find input key " << key; + } + NDArray& nd = p->arg_arrays[it->second]; + nd.SyncCopyFromCPU(data, size); + API_END(); +} + +int MXPredForward(PredictorHandle handle) { + MXAPIPredictor* p = static_cast(handle); + API_BEGIN(); + p->exec->Forward(false); + API_END(); +} + +int MXPredGetOutput(PredictorHandle handle, + mx_uint index, + mx_float* data, + mx_uint size) { + MXAPIPredictor* p = static_cast(handle); + API_BEGIN(); + CHECK_LT(index, p->out_arrays.size()) + << "Output index out of range"; + const NDArray& nd = p->out_arrays[index]; + nd.SyncCopyToCPU(data, size); + API_END(); +} + +int MXPredFree(PredictorHandle handle) { + API_BEGIN(); + delete static_cast(handle); + API_END(); +} + +int MXNDListCreate(const char* nd_file_bytes, + size_t nd_file_size, + NDListHandle *out, + mx_uint* out_length) { + MXAPINDList* ret = new MXAPINDList(); + API_BEGIN(); + std::vector arrays; + dmlc::MemoryFixedSizeStream fi((void*)nd_file_bytes, nd_file_size); // NOLINT(*) + NDArray::Load(&fi, + &(arrays), + &(ret->keys)); + if (ret->keys.size() == 0) { + ret->keys.resize(arrays.size()); + } + ret->indptr.push_back(0); + for (size_t i = 0; i < arrays.size(); ++i) { + TShape shape = arrays[i].shape(); + size_t begin = ret->data.size(); + size_t size = shape.Size(); + ret->shapes.push_back(shape); + ret->data.resize(begin + size); + arrays[i].SyncCopyToCPU(dmlc::BeginPtr(ret->data) + begin, size); + ret->indptr.push_back(begin + size); + } + *out = ret; + *out_length = static_cast(arrays.size()); + API_END(); +} + +int MXNDListGet(NDListHandle handle, + mx_uint index, + const char** out_key, + const mx_float** out_data, + const mx_uint** out_shape, + mx_uint* out_ndim) { + MXAPINDList* p = static_cast(handle); + API_BEGIN(); + CHECK_LT(index, p->shapes.size()) + << "Index out of range"; + *out_key = p->keys[index].c_str(); + *out_data = dmlc::BeginPtr(p->data) + p->indptr[index]; + *out_shape = p->shapes[index].data(); + *out_ndim = p->shapes[index].ndim(); + API_END(); +} + +int MXNDListFree(NDListHandle handle) { + API_BEGIN(); + delete static_cast(handle); + API_END(); +} + + diff --git a/src/common/tblob_op_registry.cc b/src/common/tblob_op_registry.cc new file mode 100644 index 000000000000..8dac8944f144 --- /dev/null +++ b/src/common/tblob_op_registry.cc @@ -0,0 +1,353 @@ +/*! 
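The predict API is intentionally tiny: create a predictor from a symbol JSON string plus a parameter blob, set inputs, run forward, read outputs. Below is a hedged ctypes sketch of that call sequence; the library path, model file names, the single `data` input and its 1x3x224x224 shape are all assumptions:

```python
import ctypes

lib = ctypes.CDLL('./lib/libmxnet.so')
sym_json = open('model-symbol.json').read()
param_bytes = open('model-0010.params', 'rb').read()

handle = ctypes.c_void_p()
keys = (ctypes.c_char_p * 1)('data')
indptr = (ctypes.c_uint * 2)(0, 4)
shape = (ctypes.c_uint * 4)(1, 3, 224, 224)
ret = lib.MXPredCreate(sym_json, param_bytes, ctypes.c_size_t(len(param_bytes)),
                       1, 0,                      # dev_type=1 (cpu), dev_id=0
                       1, keys, indptr, shape,
                       ctypes.byref(handle))
assert ret == 0

batch = (ctypes.c_float * (1 * 3 * 224 * 224))()  # zero-filled input, for illustration only
lib.MXPredSetInput(handle, 'data', batch, len(batch))
lib.MXPredForward(handle)

shape_data = ctypes.POINTER(ctypes.c_uint)()
ndim = ctypes.c_uint()
lib.MXPredGetOutputShape(handle, 0, ctypes.byref(shape_data), ctypes.byref(ndim))
out_size = 1
for i in range(ndim.value):
    out_size *= shape_data[i]
out = (ctypes.c_float * out_size)()
lib.MXPredGetOutput(handle, 0, out, out_size)
lib.MXPredFree(handle)
```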
+ * Copyright (c) 2015 by Contributors + * \file tblob_op_registry.cc + * Implementation of tblob op registry + */ +#include +#include +#include +#include +#include "./tblob_op_registry.h" + +namespace mxnet { +namespace common { +class TBlobUnaryOpProp; + +class TBlobOpRegEntryImpl : public TBlobOpRegEntry { + public: + // functions + TSelf& set_function(int dev_mask, + UnaryFunction funary, + bool inplace_in_out, + bool register_symbolic) override { + std::lock_guard lock(mutex_); + ++reg_counter_; + if (funary_.size() <= static_cast(dev_mask)) { + funary_.resize(dev_mask + 1, nullptr); + } + if (funary_[dev_mask] != nullptr) { + LOG(FATAL) << "Device function " << this->name + << " already registerd for device " << dev_mask; + } + funary_[dev_mask] = funary; + inplace_in0_out_forward_ = inplace_in_out; + if (reg_counter_ == 1) { + this->RegisterUnary(); + register_symbolic_ = register_symbolic; + if (register_symbolic) { + this->RegisterUnarySymbolic(); + } + } + return *this; + } + + TSelf& set_gradient(int dev_mask, + UnaryGradType1 fgrad, + bool inplace_out_in_grad) override { + std::lock_guard lock(mutex_); + if (funary_grad_t1_.size() <= static_cast(dev_mask)) { + funary_grad_t1_.resize(dev_mask + 1, nullptr); + } + if (funary_grad_t1_[dev_mask] != nullptr) { + LOG(FATAL) << "Device gradient function " << this->name + << " already registerd for device " << dev_mask; + } + funary_grad_t1_[dev_mask] = fgrad; + inplace_out_in0_grad_ = inplace_out_in_grad; + return *this; + } + + TSelf& set_gradient(int dev_mask, + UnaryGradType2 fgrad, + bool inplace_out_in_grad) override { + std::lock_guard lock(mutex_); + if (funary_grad_t2_.size() <= static_cast(dev_mask)) { + funary_grad_t2_.resize(dev_mask + 1, nullptr); + } + if (funary_grad_t2_[dev_mask] != nullptr) { + LOG(FATAL) << "Device gradient function " << this->name + << " already registerd for device " << dev_mask; + } + funary_grad_t2_[dev_mask] = fgrad; + inplace_out_in0_grad_ = inplace_out_in_grad; + return *this; + } + + TSelf& set_shape_infer(UnaryShapeInfer fshapeinfer) override { + std::lock_guard lock(mutex_); + unary_infer_ = fshapeinfer; + return *this; + } + + TSelf& describe(const std::string &description) override { + std::lock_guard lock(mutex_); + if (reg_counter_ != 1) return *this; + NDArrayReg().describe(description); + if (register_symbolic_) { + OpReg().describe(description); + } + return *this; + } + + private: + // make friend with unary op + friend class TBlobUnaryOpProp; + // internal mutex + std::mutex mutex_; + // registration counter + int reg_counter_{0}; + bool register_symbolic_{true}; + // unary shape inferencer + UnaryShapeInfer unary_infer_{nullptr}; + // unary functions on each device mask + std::vector funary_; + // type 1 gradient function + std::vector funary_grad_t1_; + // type 2 gradient function + std::vector funary_grad_t2_; + // whether do inplace optimization of in 0 and output + bool inplace_in0_out_forward_{true}; + // whether do inplace optimization of out_grad and in_grad0 + bool inplace_out_in0_grad_{false}; + // NDArray registry + NDArrayFunctionReg *ndarray_reg_{nullptr}; + OperatorPropertyReg *op_reg_{nullptr}; + // internal function to register NDArray function. + inline NDArrayFunctionReg &NDArrayReg() { + if (ndarray_reg_ == nullptr) { + NDArrayFunctionReg ® = + ::dmlc::Registry::Get()->__REGISTER__(this->name); + ndarray_reg_ = ® + } + return *ndarray_reg_; + } + // internal function to register NDArray function. 
+ inline OperatorPropertyReg &OpReg() { + if (op_reg_ == nullptr) { + OperatorPropertyReg ® = + ::dmlc::Registry::Get()->__REGISTER__(this->name); + op_reg_ = ® + } + return *op_reg_; + } + // start registering all stuffs + void RegisterUnary(); + void RegisterUnarySymbolic(); +}; + +// Unary operator to invoke generic TBlob function. +struct TBlobUnaryOperator : public Operator { + TBlobOpRegEntry::UnaryFunction forward; + TBlobOpRegEntry::UnaryGradType1 backward1{nullptr}; + TBlobOpRegEntry::UnaryGradType2 backward2{nullptr}; + + void Forward(const OpContext &ctx, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data, + const std::vector &aux_args) override { + CHECK_EQ(in_data.size(), 1); + CHECK_EQ(out_data.size(), 1); + TBlob out = out_data[0]; + (*forward)(in_data[0], &out, req[0], ctx.run_ctx); + } + + void Backward(const OpContext &ctx, + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &req, + const std::vector &in_grad, + const std::vector &aux_args) override { + CHECK_EQ(out_grad.size(), 1); + CHECK(in_data.size() == 1 && in_grad.size() == 1); + CHECK_EQ(req.size(), 1); + arg::OutGrad ograd; ograd.data = out_grad[0]; + TBlob igrad = in_grad[0]; + if (backward1 != nullptr) { + arg::OutValue out_value; out_value.data = out_data[0]; + (*backward1)(ograd, out_value, &igrad, req[0], ctx.run_ctx); + } else if (backward2 != nullptr) { + arg::Input0 in0; in0.data = in_data[0]; + (*backward2)(ograd, in0, &igrad, req[0], ctx.run_ctx); + } else { + LOG(FATAL) << "Backward is not supported"; + } + } +}; // class UnaryOperator + +class TBlobUnaryOpProp : public OperatorProperty { + public: + std::string name; + TBlobOpRegEntryImpl* source; + + void Init(const std::vector >& kwargs) override { + } + + std::map GetParams() const override { + return std::map(); + } + + bool InferShape(std::vector *in_shape, + std::vector *out_shape, + std::vector *aux_shape) const override { + using namespace mshadow; + CHECK_EQ(in_shape->size(), 1) << "Input:[data]"; + const TShape &dshape = in_shape->at(0); + if (dshape.ndim() == 0) return false; + out_shape->clear(); + if (source->unary_infer_ == nullptr) { + out_shape->push_back(dshape); + } else { + out_shape->push_back((*(source->unary_infer_))(dshape)); + } + return true; + } + + OperatorProperty* Copy() const override { + auto ptr = new TBlobUnaryOpProp(); + ptr->source = source; + ptr->name = name; + return ptr; + } + + std::string TypeString() const override { + return name; + } + + // decalre dependency and inplace optimization options + std::vector DeclareBackwardDependency( + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data) const override { + if (source->funary_grad_t1_.size() != 0) { + return {out_grad[0], out_data[0]}; + } else if (source->funary_grad_t2_.size() != 0) { + return {out_grad[0], in_data[0]}; + } else { + LOG(FATAL) << "Backward of " << name << " is not decalred"; + return {}; + } + } + + std::vector > BackwardInplaceOption( + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &in_grad) const override { + if (source->inplace_out_in0_grad_) { + return {{out_grad[0], in_grad[0]}}; + } else { + return {}; + } + } + + std::vector > ForwardInplaceOption( + const std::vector &in_data, + const std::vector &out_data) const override { + if (source->inplace_in0_out_forward_) { + return {{in_data[0], out_data[0]}}; + } else { + return {}; + } + } + + 
Operator* CreateOperator(Context ctx) const { + size_t dev_mask = ctx.dev_mask(); + TBlobUnaryOperator *op = new TBlobUnaryOperator(); + CHECK(dev_mask < source->funary_.size() && source->funary_[dev_mask] != nullptr); + op->forward = source->funary_[dev_mask]; + if (dev_mask < source->funary_grad_t1_.size()) { + op->backward1 = source->funary_grad_t1_[dev_mask]; + } + if (dev_mask < source->funary_grad_t2_.size()) { + op->backward2 = source->funary_grad_t2_[dev_mask]; + } + return op; + } +}; + +void TBlobOpRegEntryImpl::RegisterUnary() { + CHECK_EQ(reg_counter_, 1); + // The body to be registered + auto body = [this] (NDArray **used_vars, + real_t *s, + NDArray **mutate_vars) { + NDArray src = *used_vars[0]; + NDArray *out = mutate_vars[0]; + TShape dshape = src.shape(); + if (unary_infer_ != nullptr) dshape = unary_infer_(dshape); + + if (out->is_none()) { + *out = NDArray(dshape, src.ctx(), true); + } else { + CHECK(out->ctx() == src.ctx()) << "target context mismatch"; + CHECK(out->shape() == dshape) << "target shape mismatch " + << out->shape() << " vs. " << dshape; + } + // important: callback must always capture by value + NDArray ret = *out; + // get the const variables + std::vector const_vars; + if (src.var() != ret.var()) const_vars.push_back(src.var()); + // check if the function exist + int dev_mask = src.ctx().dev_mask(); + if (static_cast(dev_mask) >= funary_.size() || + funary_[dev_mask] == nullptr) { + if (dev_mask == gpu::kDevMask) LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR; + LOG(FATAL) << "Function " << this->name << "not registered for device " << dev_mask; + } + // invoke the function + UnaryFunction fun = funary_[dev_mask]; + Engine::Get()->PushSync([src, ret, fun, dev_mask](RunContext ctx) { + ret.CheckAndAlloc(); + TBlob tmp = ret.data(); + (*fun)(src.data(), &tmp, kWriteTo, ctx); +#if MXNET_USE_CUDA + if (dev_mask == gpu::kDevMask) { + ctx.get_stream()->Wait(); + } +#endif + }, src.ctx(), const_vars, {ret.var()}); + }; + // register the function. + NDArrayReg() + .set_body(body) + .set_num_use_vars(1) + .set_num_mutate_vars(1) + .set_type_mask(kNDArrayArgBeforeScalar | kAcceptEmptyMutateTarget) + .add_argument("src", "NDArray", "Source input to the function"); +} + +void TBlobOpRegEntryImpl::RegisterUnarySymbolic() { + // register the operator + auto op_factory = [this]() { + TBlobUnaryOpProp *prop = new TBlobUnaryOpProp(); + prop->name = this->name; + prop->source = this; + return prop; + }; + OpReg() + .set_body(op_factory) + .add_argument("src", "Symbol", "Source symbolic input to the function"); +} +TBlobOpRegEntry& TBlobOpRegistry::__REGISTER_OR_FIND__(const std::string &name) { + if (fmap_.count(name) != 0) return *fmap_.at(name); + TBlobOpRegEntry *e = new TBlobOpRegEntryImpl(); + e->name = name; + fmap_[name] = e; + return *e; +} + +TBlobOpRegistry* TBlobOpRegistry::Get() { + static TBlobOpRegistry inst; + return &inst; +} + +TBlobOpRegistry::~TBlobOpRegistry() { + for (auto kv : fmap_) { + delete kv.second; + } +} +} // namespace common +} // namespace mxnet diff --git a/src/common/tblob_op_registry.h b/src/common/tblob_op_registry.h new file mode 100644 index 000000000000..d6f5b1644b74 --- /dev/null +++ b/src/common/tblob_op_registry.h @@ -0,0 +1,137 @@ +/*! + * Copyright (c) 2015 by Contributors + * \file tblob_op_registry.h + * \brief Helper registry to make registration of simple unary binary math function easy. + * Register to this registry will enable both symbolic operator and NDArray operator in client. 
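As the header comment above notes, one registration gives the client two entry points under the same name: an imperative NDArray function and a symbolic operator. A hedged Python-side illustration, assuming some unary op named `square` has been registered through `MXNET_REGISTER_TBLOB_FUN(square, ...)`:

```python
import mxnet as mx

x = mx.nd.array([1.0, 2.0, 3.0])
y = mx.nd.square(x)                # imperative NDArray function

data = mx.symbol.Variable('data')
net = mx.symbol.square(data)       # symbolic operator registered under the same name
```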
+ * + * More complicated operators can be registered in normal way in ndarray and operator modules. + */ +#ifndef MXNET_COMMON_TBLOB_OP_REGISTRY_H_ +#define MXNET_COMMON_TBLOB_OP_REGISTRY_H_ + +#include +#include +#include +#include +#include +#include +#include + +#if DMLC_USE_CXX11 +#include +#endif + +namespace mxnet { +namespace common { +/*! \brief namespace of arguments */ +namespace arg { +/*! \brief super class of all gradient function argument */ +struct GradFunctionArgument { + /*! \brief The real data */ + TBlob data; +}; +/*! \brief First input to the function */ +struct Input0 : GradFunctionArgument {}; +/*! \brief Second input to the function */ +struct Input1 : GradFunctionArgument {}; + +/*! \brief Ouput value of the function to the function */ +struct OutValue : GradFunctionArgument {}; +/*! \brief Gradient of output value */ +struct OutGrad : GradFunctionArgument {}; +} // namespace arg + +/*! \brief registry for function entry */ +class TBlobOpRegEntry { + public: + typedef void (*UnaryFunction)(const TBlob &src, + TBlob* ret, + OpReqType req, + RunContext ctx); + typedef TShape (*UnaryShapeInfer)(const TShape &src); + typedef void (*UnaryGradType1)(const arg::OutGrad& out_grad, + const arg::OutValue& out_value, + TBlob* in_grad, + OpReqType req, + RunContext ctx); + typedef void (*UnaryGradType2)(const arg::OutGrad& out_grad, + const arg::Input0& in_data0, + TBlob* in_grad, + OpReqType req, + RunContext ctx); + /*! \brief declare self type */ + typedef TBlobOpRegEntry TSelf; + /*! \brief name of the entry */ + std::string name; + /*! + * \brief set shape inference function, by default use same shape. + * \param fshapeinfer The unary function that peforms the operation. + */ + virtual TSelf& set_shape_infer(UnaryShapeInfer fshapeinfer) = 0; + /*! + * \brief set function of the function to be funary + * \param dev_mask The device mask of the function can act on. + * \param funary The unary function that peforms the operation. + * \param inplace_in_out Whether do inplace optimization on in and out. + * \param register_symbolic Whether register a symbolic operator as well. + */ + virtual TSelf& set_function(int dev_mask, + UnaryFunction funary, + bool inplace_in_out, + bool register_symbolic = true) = 0; + /*! + * \brief set gradient of the function of this function. + * \param dev_mask The device mask of the function can act on. + * \param fgrad The gradient function to be set. + * \param inplace_out_in_grad whether out_grad and in_grad can share memory. + */ + virtual TSelf& set_gradient(int dev_mask, + UnaryGradType1 fgrad, + bool inplace_out_in_grad) = 0; + virtual TSelf& set_gradient(int dev_mask, + UnaryGradType2 fgrad, + bool inplace_out_in_grad) = 0; + /*! + * \brief Describe the function. + * \param description The description of the function. + * \return reference to self. + */ + virtual TSelf& describe(const std::string &description) = 0; + /*! \brief destructor */ + virtual ~TBlobOpRegEntry() {} +}; + +/*! \brief registry for TBlob functions */ +class TBlobOpRegistry { + public: + /*! + * \brief Internal function to register a name function under name. + * \param name name of the function + * \return ref to the registered entry, used to set properties + */ + TBlobOpRegEntry &__REGISTER_OR_FIND__(const std::string& name); + /*! + * \brief Find the entry with corresponding name. 
+ * \param name name of the function + * \return the corresponding function, can be NULL + */ + inline static const TBlobOpRegEntry *Find(const std::string &name) { + return Get()->fmap_.at(name); + } + /*! \return global singleton of the registry */ + static TBlobOpRegistry* Get(); + + private: + // destructor + ~TBlobOpRegistry(); + /*! \brief internal registry map */ + std::map fmap_; +}; + +#define MXNET_REGISTER_TBLOB_FUN(Name, DEV) \ + static ::mxnet::common::TBlobOpRegEntry & \ + __make_ ## TBlobOpRegEntry ## _ ## Name ## __ ## DEV ##__ = \ + ::mxnet::common::TBlobOpRegistry::Get()->__REGISTER_OR_FIND__(#Name) +} // namespace common +} // namespace mxnet +#endif // MXNET_COMMON_TBLOB_OP_REGISTRY_H_ diff --git a/src/common/thread_local.h b/src/common/thread_local.h new file mode 100644 index 000000000000..4853694df79f --- /dev/null +++ b/src/common/thread_local.h @@ -0,0 +1,77 @@ +/*! + * Copyright (c) 2015 by Contributors + * \file thread_local.h + * \brief Common utility for thread local storage. + */ +#ifndef MXNET_COMMON_THREAD_LOCAL_H_ +#define MXNET_COMMON_THREAD_LOCAL_H_ + +#include +#include +#include + +namespace mxnet { +namespace common { + +// macro hanlding for threadlocal variables +#ifdef __GNUC__ + #define MX_TREAD_LOCAL __thread +#elif __STDC_VERSION__ >= 201112L + #define MX_TREAD_LOCAL _Thread_local +#elif defined(_MSC_VER) + #define MX_TREAD_LOCAL __declspec(thread) +#endif + +#ifndef MX_TREAD_LOCAL +#message("Warning: Threadlocal is not enabled"); +#endif + +/*! + * \brief A threadlocal store to store threadlocal variables. + * Will return a thread local singleton of type T + * \tparam T the type we like to store + */ +template +class ThreadLocalStore { + public: + /*! \return get a thread local singleton */ + static T* Get() { + static MX_TREAD_LOCAL T* ptr = nullptr; + if (ptr == nullptr) { + ptr = new T(); + Singleton()->RegisterDelete(ptr); + } + return ptr; + } + + private: + /*! \brief constructor */ + ThreadLocalStore() {} + /*! \brief destructor */ + ~ThreadLocalStore() { + for (size_t i = 0; i < data_.size(); ++i) { + delete data_[i]; + } + } + /*! \return singleton of the store */ + static ThreadLocalStore *Singleton() { + static ThreadLocalStore inst; + return &inst; + } + /*! + * \brief register str for internal deletion + * \param str the string pointer + */ + void RegisterDelete(T *str) { + std::unique_lock lock(mutex_); + data_.push_back(str); + lock.unlock(); + } + /*! \brief internal mutex */ + std::mutex mutex_; + /*!\brief internal data */ + std::vector data_; +}; +} // namespace common +} // namespace mxnet +#endif // MXNET_COMMON_THREAD_LOCAL_H_ diff --git a/src/common/utils.h b/src/common/utils.h index fbaf5f4fdb55..574fc242ebd8 100644 --- a/src/common/utils.h +++ b/src/common/utils.h @@ -25,14 +25,14 @@ namespace common { // heuristic to dermine number of threads per GPU inline int GetNumThreadPerGPU() { // This is resource efficient option. - return dmlc::GetEnv("MXNET_GPU_WORKER_NTHREADS", 1); + return dmlc::GetEnv("MXNET_GPU_WORKER_NTHREADS", 2); } // heuristic to get number of matching colors. // this decides how much parallelism we can get in each GPU. inline int GetExecNumMatchColor() { // This is resource efficient option. 
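`common::ThreadLocalStore<T>` hands every thread its own lazily created instance of `T` and deletes those instances when the process exits. For readers more at home in Python, a rough analogue of the access pattern looks like this (illustration only, not part of the patch):

```python
import threading

class ThreadLocalStore(object):
    """Each thread gets its own lazily created value, like ThreadLocalStore<T>::Get()."""
    _tls = threading.local()

    @classmethod
    def get(cls, factory):
        if not hasattr(cls._tls, 'value'):
            cls._tls.value = factory()   # per-thread singleton, created on first access
        return cls._tls.value

entry = ThreadLocalStore.get(dict)       # every thread calling this sees its own dict
```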
- int num_match_color = dmlc::GetEnv("MXNET_EXEC_NUM_TEMP", 4); + int num_match_color = dmlc::GetEnv("MXNET_EXEC_NUM_TEMP", 1); return std::min(num_match_color, GetNumThreadPerGPU()); } diff --git a/src/io/iter_normalize.h b/src/io/iter_normalize.h index add700095892..19d3050696e1 100644 --- a/src/io/iter_normalize.h +++ b/src/io/iter_normalize.h @@ -100,7 +100,10 @@ class ImageNormalizeIter : public IIterator { // use python compatible ndarray store format std::vector data; std::vector keys; - NDArray::Load(param_.mean_img, &data, &keys); + { + std::unique_ptr fi(dmlc::Stream::Create(param_.mean_img.c_str(), "r")); + NDArray::Load(fi.get(), &data, &keys); + } CHECK_EQ(data.size(), 1) << "Invalid mean image file format"; data[0].WaitToRead(); @@ -220,9 +223,12 @@ class ImageNormalizeIter : public IIterator { meanimg_ *= (1.0f / imcnt); // save as mxnet python compatible format. TBlob tmp = meanimg_; - NDArray::Save(param_.mean_img, - {NDArray(tmp, 0)}, - {"mean_img"}); + { + std::unique_ptr fo(dmlc::Stream::Create(param_.mean_img.c_str(), "w")); + NDArray::Save(fo.get(), + {NDArray(tmp, 0)}, + {"mean_img"}); + } if (param_.verbose) { LOG(INFO) << "Save mean image to " << param_.mean_img << ".."; } diff --git a/src/kvstore/kvstore.cc b/src/kvstore/kvstore.cc index b4044d0a1a0c..0de025ba9a35 100644 --- a/src/kvstore/kvstore.cc +++ b/src/kvstore/kvstore.cc @@ -10,7 +10,6 @@ #include "./kvstore_device.h" #if MXNET_USE_DIST_KVSTORE #include "./kvstore_dist.h" -#include "./mxnet_ps_node.h" #endif // MXNET_USE_DIST_KVSTORE namespace mxnet { @@ -36,7 +35,7 @@ KVStore* KVStore::Create(const char *type_name) { kv->IsWorkerNode() && kv->get_rank() == 0) { // configure the server to be the sync mode - kv->SendCommandToServers(kvstore::CommandID::kSyncMode, ""); + kv->SendCommandToServers(kvstore::kSyncMode, ""); } #else LOG(FATAL) << "compile with USE_DIST_KVSTORE=1 to use " << tname; @@ -50,25 +49,3 @@ KVStore* KVStore::Create(const char *type_name) { } } // namespace mxnet - -#if MXNET_USE_DIST_KVSTORE - -namespace ps { - -App* App::Create(int argc, char *argv[]) { - NodeInfo n; - if (n.IsWorker()) { - return new ::mxnet::kvstore::MXNetWorker(); - } else if (n.IsServer()) { - return new ::mxnet::kvstore::MXNetServer(); - } else if (n.IsScheduler()) { - return new ::mxnet::kvstore::MXNetScheduler(); - } else { - LOG(FATAL) << "unknown node"; - } - return NULL; -} - -} // namespace ps - -#endif // MXNET_USE_DIST_KVSTORE diff --git a/src/kvstore/kvstore_dist.h b/src/kvstore/kvstore_dist.h index ef5cb6b999d7..18d50ffa5e61 100644 --- a/src/kvstore/kvstore_dist.h +++ b/src/kvstore/kvstore_dist.h @@ -8,11 +8,9 @@ #include #include #include "./kvstore_local.h" -#include "./mxnet_ps_node.h" #include "mxnet/engine.h" -// #include "dmlc/parameter.h" -#include "ps.h" -#include "base/range.h" +#include "ps/ps.h" +#include "./kvstore_dist_server.h" namespace mxnet { namespace kvstore { @@ -29,27 +27,22 @@ namespace kvstore { */ class KVStoreDist : public KVStoreLocal { public: - KVStoreDist() - : server_(NULL), - cache_(NULL), - barrier_count_(0) { + KVStoreDist() : ps_worker_(nullptr), server_(nullptr) { if (IsWorkerNode()) { - cache_ = new ps::KVCache(PS_KV_ID); - StartPS(); + ps_worker_ = new ps::KVWorker(0); + ps::Start("mxnet\0"); } } virtual ~KVStoreDist() { Engine::Get()->WaitForAll(); - delete cache_; - if (IsWorkerNode()) { if (get_rank() == 0) { // stop the executor at servers - SendCommandToServers(CommandID::kStop, ""); + SendCommandToServers(kStopServer, ""); } - Barrier(); - ps::StopSystem(); + 
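The two environment-variable defaults above were retuned (two GPU worker threads, one temp color), but both remain overridable per process. A trivial sketch; the explicit values below simply restate the new defaults and are not required:

```python
import os

os.environ['MXNET_GPU_WORKER_NTHREADS'] = '2'   # new default; raise for more per-GPU parallelism
os.environ['MXNET_EXEC_NUM_TEMP'] = '1'         # new default; trades memory for parallelism
import mxnet as mx                               # must be set before the engine starts
```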
ps::Finalize(); + delete ps_worker_; } } @@ -62,15 +55,13 @@ class KVStoreDist : public KVStoreLocal { Wait(keys); } else { // do nothing - // // simply increase the clock. it's necessary for BSP - // cache_->executor()->IncrClock(keys.size()); } Barrier(); } void Push(const std::vector& keys, const std::vector& values, - int priority) override { + int priority) override { // first aggregate the values over keys std::vector uniq_keys; std::vector > grouped_vals; @@ -84,21 +75,16 @@ class KVStoreDist : public KVStoreLocal { // push to servers auto push_to_servers = [this, key, merged](RunContext rctx, Engine::CallbackOnComplete cb) { - // convert to ps keys + // convert to ps keys size_t size = merged.shape().Size(); PSKV& pskv = EncodeKey(key, size); // do push real_t* data = static_cast(merged.data().dptr_); - ps::SArray vals(data, size, ps::EmptyDel()); - ps::SyncOpts opts; - opts.callback = [cb]() { cb(); }; - CHECK_NOTNULL(cache_)->Push( - opts.GetTask(), - pskv.keys, - vals, - pskv.vals_size, - opts.callback); + // false means no delete + ps::SArray vals(data, size, false); + CHECK_NOTNULL(ps_worker_)->ZPush( + pskv.keys, vals, pskv.lens, 0, [cb]() { cb(); }); }; Engine::Get()->PushAsync( push_to_servers, @@ -134,18 +120,10 @@ class KVStoreDist : public KVStoreLocal { // convert to ps keys PSKV& pskv = EncodeKey(key, size); - // pull opts - ps::SyncOpts opts; - opts.callback = [cb]() { cb(); }; - - // issue pull - CHECK_NOTNULL(cache_)->Pull( - opts.GetTask(), - pskv.keys, - opts.callback, - data, - size, - pskv.vals_size.data()); + // issue pull, false means no delete + auto vals = new ps::SArray(data, size, false); + CHECK_NOTNULL(ps_worker_)->ZPull( + pskv.keys, vals, &pskv.lens, 0, [vals, cb](){ delete vals; cb(); }); }; CHECK_NOTNULL(Engine::Get())->PushAsync( @@ -172,36 +150,31 @@ class KVStoreDist : public KVStoreLocal { } void Barrier() override { - ps::Task task; - task.set_cmd(CommandID::SetBarrier(barrier_count_++)); - auto node = CHECK_NOTNULL(ps::NodeInfo::MyApp()); - node->Wait(node->Submit(task, ps::NodeInfo::SchedulerID())); + ps::Postoffice::Get()->Barrier(ps::kWorkerGroup); } void SendCommandToServers(int cmd_id, const std::string& cmd_body) override { - ps::Task task; - task.set_cmd(cmd_id); - task.set_msg(cmd_body); - auto node = CHECK_NOTNULL(ps::NodeInfo::MyApp()); - node->Wait(node->Submit(task, ps::kServerGroup)); + CHECK_NOTNULL(ps_worker_); + ps_worker_->Wait(ps_worker_->Request(cmd_id, cmd_body, ps::kServerGroup)); } - int get_group_size() const override { return ps::NodeInfo::RankSize(); } + int get_group_size() const override { return ps::NumWorkers(); } - int get_rank() const override { return ps::NodeInfo::MyRank(); } + int get_rank() const override { return ps::MyRank(); } void RunServer(const Controller& controller) override { CHECK(!IsWorkerNode()); - StartPS(); if (IsServerNode()) { - server_ = new KVStoreDistServer(controller); - server_->Run(); - delete server_; - server_ = nullptr; + server_ = new KVStoreDistServer(); + server_->set_controller(controller); } - ps::StopSystem(); + + ps::Start("mxnet_server\0"); + if (server_) server_->Run(); + ps::Finalize(); + delete server_; server_ = nullptr; } private: @@ -232,25 +205,11 @@ class KVStoreDist : public KVStoreLocal { } /** - * \brief start the network threads in ps-lite - */ - void StartPS() { - // hack argc argv - int argc = 1; - char** argv = new char*[1]; - char name[] = "mxnet"; - argv[0] = new char[strlen(name)+1]; - memcpy(argv[0], name, strlen(name)); - argv[0][strlen(name)] = '\0'; - 
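From a worker's point of view the rewritten `KVStoreDist` behaves like any other kvstore; only the transport underneath changed to ps-lite's `ZPush`/`ZPull`. A hedged usage sketch, assuming the scheduler, server and worker processes have already been launched by the usual distributed launcher:

```python
import mxnet as mx

kv = mx.kvstore.create('dist_sync')        # or 'dist_async'
shape = (2, 3)
kv.init(3, mx.nd.ones(shape))              # key 3; the value lands on the servers
kv.push(3, mx.nd.ones(shape) * 2)          # ZPush under the hood
out = mx.nd.zeros(shape)
kv.pull(3, out=out)                        # ZPull under the hood
print(out.asnumpy())
```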
ps::StartSystem(&argc, &argv); - } - - /** - * \brief struct for ps keys and vals_size + * \brief struct for ps keys and lens */ struct PSKV { ps::SArray keys; // n keys - ps::SArray vals_size; // the length of the i-th value + ps::SArray lens; // the length of the i-th value int size; }; @@ -264,72 +223,58 @@ class KVStoreDist : public KVStoreLocal { */ std::mutex mu_; - /** - * \brief key partition of server nodes in ps - */ - std::vector server_key_partition_; - /** * \brief convert to keys in ps */ inline PSKV& EncodeKey(int key, size_t size) { - CHECK_EQ(sizeof(ps::Key), 8) << "Do not use USE_KEY32=1 to compile ps-lite"; - int num_servers = ps::NodeInfo::NumServers(); - CHECK_GT(num_servers, 0); - mu_.lock(); - // init key parititon - if (server_key_partition_.empty()) { - auto all = ps::Range::All(); - for (int i = 0; i < num_servers; ++i) { - ps::Key key = all.EvenDivide(num_servers, i).begin(); - server_key_partition_.push_back( - ((key >> CommandID::kIndexBits)+1) << CommandID::kIndexBits); - } - } - PSKV& pskv = ps_kv_[key]; mu_.unlock(); if (!pskv.keys.empty()) { CHECK_EQ(pskv.size, size) << "The value size cannot be changed"; } else { + auto krs = ps::Postoffice::Get()->GetServerKeyRanges(); + int num_servers = krs.size(); + CHECK_GT(num_servers, 0); + // a simple heuristic for load balance if (size < bigarray_bound_) { // send it to a single random picked server int server = (key * 9973) % num_servers; - pskv.keys.push_back(server_key_partition_[server] | key); - pskv.vals_size.push_back(size); + ps::Key ps_key = krs[server].begin() + key; + CHECK_LT(ps_key, krs[server].end()); + pskv.keys.push_back(ps_key); + pskv.lens.push_back(size); + pskv.size = size; } else { - // divide it to all servers - auto all = ps::Range(0, size); + // parition it to all servers + pskv.size = 0; for (int i = 0; i < num_servers; ++i) { - pskv.keys.push_back(server_key_partition_[i] | key); - pskv.vals_size.push_back(all.EvenDivide(num_servers, i).size()); + size_t part_size = + static_cast(static_cast(size)/num_servers*(i+1)) - + static_cast(static_cast(size)/num_servers*i); + ps::Key ps_key = krs[i].begin() + key; + CHECK_LT(ps_key, krs[i].end()); + pskv.keys.push_back(ps_key); + pskv.lens.push_back(part_size); + pskv.size += part_size; } + CHECK_EQ(pskv.size, size); } - pskv.size = size; } return pskv; } - /** - * \brief a server node - */ - KVStoreDistServer* server_; - /** * \brief for worker to push and pull data - * use KVCache rather than KVWorker for the c-style pull */ - ps::KVCache* cache_; - - + ps::KVWorker* ps_worker_; /** - * \brief the count for barrier + * \brief the server handle */ - int barrier_count_; + KVStoreDistServer* server_; }; } // namespace kvstore diff --git a/src/kvstore/kvstore_dist_server.h b/src/kvstore/kvstore_dist_server.h new file mode 100644 index 000000000000..d25d6d95d989 --- /dev/null +++ b/src/kvstore/kvstore_dist_server.h @@ -0,0 +1,245 @@ +/*! 
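`EncodeKey` splits a large value evenly across the servers with floor arithmetic, so the per-server lengths always sum back to the original size. The helper below restates that arithmetic in Python as a quick check (a sketch mirroring the C++ above):

```python
def partition_lens(size, num_servers):
    """Length of the slice sent to each server, as computed in KVStoreDist::EncodeKey."""
    return [int(float(size) / num_servers * (i + 1)) - int(float(size) / num_servers * i)
            for i in range(num_servers)]

print(partition_lens(10, 3))   # -> [3, 3, 4], which sums back to 10
```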
+ * Copyright (c) 2015 by Contributors + * \file mxnet_node.h + * \brief implement mxnet nodes + */ +#ifndef MXNET_KVSTORE_KVSTORE_DIST_SERVER_H_ +#define MXNET_KVSTORE_KVSTORE_DIST_SERVER_H_ +#include +#include +#include +#include +#include +#include +#include +#include +#include "ps/ps.h" +#include "mxnet/kvstore.h" + +namespace mxnet { +namespace kvstore { + +static const int kStopServer = -1; +static const int kSyncMode = -2; + +/** + * \brief executor runs a function using the thread called \ref Start + */ +class Executor { + public: + /** + * \brief start the executor + */ + void Start() { + std::unique_lock lk(mu_); + while (true) { + cond_.wait(lk, [this]{return !queue_.empty();}); + Block blk = std::move(queue_.front()); + queue_.pop(); + lk.unlock(); + + if (blk.f) { + blk.f(); blk.p.set_value(); + } else { + blk.p.set_value(); break; + } + lk.lock(); + } + } + + /** + * \brief function + */ + typedef std::function Func; + + /** + * \brief let the thread called \ref Start to exec a function. threadsafe + */ + void Exec(const Func& func) { + Block blk(func); + auto fut = blk.p.get_future(); + { + std::lock_guard lk(mu_); + queue_.push(std::move(blk)); + cond_.notify_one(); + } + fut.wait(); + } + + /** + * \brief stop the thread, threadsafe + */ + void Stop() { + Exec(Func()); + } + + private: + struct Block { + explicit Block(const Func& func) : f(func) { } + Func f; + std::promise p; + }; + std::queue queue_; + std::mutex mu_; + std::condition_variable cond_; +}; + +class KVStoreDistServer { + public: + KVStoreDistServer() { + using namespace std::placeholders; + ps_server_ = new ps::KVServer(0); + static_cast(ps_server_)->set_request_handle( + std::bind(&KVStoreDistServer::CommandHandle, this, _1, _2)); + ps_server_->set_request_handle( + std::bind(&KVStoreDistServer::DataHandle, this, _1, _2, _3)); + sync_mode_ = false; + } + + ~KVStoreDistServer() { + delete ps_server_; + } + + void set_controller(const KVStore::Controller& controller) { + CHECK(controller); + controller_ = controller; + } + + void set_updater(const KVStore::Updater& updater) { + CHECK(updater); + updater_ = updater; + } + + /** + * \brief blocked until received the command \a kSyncMode + */ + void Run() { + exec_.Start(); + } + + private: + void CommandHandle(const ps::SimpleData& recved, ps::SimpleApp* app) { + if (recved.head == kStopServer) { + exec_.Stop(); + } else if (recved.head == kSyncMode) { + sync_mode_ = true; + } else { + // let the main thread to execute ctrl, which is necessary for python + exec_.Exec([this, recved]() { + CHECK(controller_); + controller_(recved.head, recved.body); + }); + } + app->Response(recved); + } + + void DataHandle(const ps::KVMeta& req_meta, + const ps::KVPairs& req_data, + ps::KVServer* server) { + // do some check + CHECK_EQ(req_data.keys.size(), (size_t)1); + if (req_meta.push) { + CHECK_EQ(req_data.lens.size(), (size_t)1); + CHECK_EQ(req_data.vals.size(), (size_t)req_data.lens[0]); + } + + int key = DecodeKey(req_data.keys[0]); + auto& stored = store_[key]; + + // there used several WaitToRead, this is because \a recved's memory + // could be deallocated when this function returns. 
so we need to make sure + // the operators with \a NDArray are actually finished + if (req_meta.push) { + size_t ds[] = {(size_t)req_data.lens[0]}; + TShape dshape(ds, ds + 1); + TBlob recv_blob((real_t*)req_data.vals.data(), // NOLINT(*) + dshape, cpu::kDevMask); + NDArray recved = NDArray(recv_blob, 0); + if (stored.is_none()) { + // initialization + stored = NDArray(dshape, Context()); + CopyFromTo(recved, &stored, 0); + server->Response(req_meta); + stored.WaitToRead(); + } else if (sync_mode_) { + // synced push + auto& merged = merge_buf_[key]; + if (merged.array.is_none()) { + merged.array = NDArray(dshape, Context()); + } + + if (merged.request.size() == 0) { + CopyFromTo(recved, &merged.array, 0); + } else { + merged.array += recved; + } + + merged.request.push_back(req_meta); + + if (merged.request.size() == (size_t)ps::NumWorkers()) { + // let the main thread to execute updater_, which is necessary for + // python + exec_.Exec([this, key, &merged, &stored](){ + CHECK(updater_); + updater_(key, merged.array, &stored); + }); + for (const auto& req : merged.request) { + server->Response(req); + } + merged.request.clear(); + stored.WaitToRead(); + } else { + merged.array.WaitToRead(); + } + } else { + // async push + exec_.Exec([this, key, &recved, &stored](){ + CHECK(updater_); + updater_(key, recved, &stored); + }); + server->Response(req_meta); + stored.WaitToRead(); + } + } else { + // pull + ps::KVPairs response; + CHECK(!stored.is_none()) << "init " << key << " first"; + int len = stored.shape()[0]; + response.keys = req_data.keys; + response.lens = {len}; + response.vals.CopyFrom(static_cast(stored.data().dptr_), len); + server->Response(req_meta, response); + } + } + + int DecodeKey(ps::Key key) { + auto kr = ps::Postoffice::Get()->GetServerKeyRanges()[ps::MyRank()]; + return key - kr.begin(); + } + + /** + * \brief user defined + */ + bool sync_mode_; + KVStore::Controller controller_; + KVStore::Updater updater_; + + std::unordered_map store_; + + struct MergeBuf { + std::vector request; + NDArray array; + }; + std::unordered_map merge_buf_; + + Executor exec_; + + ps::KVServer* ps_server_; +}; + + + +} // namespace kvstore +} // namespace mxnet + +#endif // MXNET_KVSTORE_KVSTORE_DIST_SERVER_H_ diff --git a/src/kvstore/kvstore_local.h b/src/kvstore/kvstore_local.h index 4426d4e82f32..e31930436821 100644 --- a/src/kvstore/kvstore_local.h +++ b/src/kvstore/kvstore_local.h @@ -64,7 +64,8 @@ class KVStoreLocal : public KVStore { for (size_t i = 0; i < uniq_keys.size(); ++i) { int key = uniq_keys[i]; - if (updater_ != nullptr) { + auto it = merge_buf_.find(key); + if (updater_ != nullptr || it == merge_buf_.end()) { auto it = local_.find(key); CHECK(it != local_.end()) << "key " << key << " has not been inited"; const NDArray& src = it->second; @@ -72,8 +73,6 @@ class KVStoreLocal : public KVStore { CopyFromTo(src, vptr, priority); } } else { - auto it = merge_buf_.find(key); - CHECK(it != merge_buf_.end()) << "key " << key << " has not been pushed"; auto& src = it->second.merged; for (auto* vptr : grouped_vals[i]) { CopyFromTo(src, vptr, priority); diff --git a/src/kvstore/mxnet_ps_node.h b/src/kvstore/mxnet_ps_node.h deleted file mode 100644 index 569a4caac962..000000000000 --- a/src/kvstore/mxnet_ps_node.h +++ /dev/null @@ -1,431 +0,0 @@ -/*! 
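The push path above is the heart of the sync protocol: in sync mode every worker's gradient is accumulated into a per-key merge buffer, the user updater runs exactly once when the last worker's push arrives, and only then are all pending requests answered. A simplified Python sketch of that bookkeeping, with plain arrays standing in for NDArrays (an illustration, not the server code):

```python
def handle_sync_push(key, recved, store, merge_buf, num_workers, updater):
    """Accumulate one worker's push; run the updater once all workers have pushed."""
    buf = merge_buf.setdefault(key, {'sum': None, 'count': 0})
    buf['sum'] = recved.copy() if buf['sum'] is None else buf['sum'] + recved
    buf['count'] += 1
    if buf['count'] == num_workers:           # last worker for this round
        updater(key, buf['sum'], store)       # e.g. store[key] -= lr * buf['sum']
        buf['sum'], buf['count'] = None, 0    # reset for the next round
```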
- * Copyright (c) 2015 by Contributors - * \file mxnet_node.h - * \brief implement mxnet nodes - */ -#ifndef MXNET_KVSTORE_MXNET_PS_NODE_H_ -#define MXNET_KVSTORE_MXNET_PS_NODE_H_ -#include -#include -#include -#include -#include -#include -#include -#include "ps.h" -#include "mxnet/kvstore.h" - -namespace mxnet { -namespace kvstore { - -/** - * \brief encode/decode a command id - */ -struct CommandID { - /** - * \brief commmand id for stoping - */ - static const int kStop = -1; - /** - * \brief command id to set the server to the sync mode - */ - static const int kSyncMode = -2; - /** - * \brief returns the commmand id given a barrier count - */ - static int SetBarrier(int count) { - return - count - 10; - } - /** - * \brief returns true if it is a barrier command - */ - static bool GetBarrier(int cmd_id, int* count) { - if (cmd_id <= 10) { - *count = - cmd_id - 10; - return true; - } - return false; - } - - /** - * \brief number of bits used to encode the key in mxnet - */ - static const int kIndexBits = 32; -}; - -/** - * \brief a simple aggregator over time. - */ -class Aggregator { - public: - /** - * \param num number of nodes for aggregation - */ - Aggregator(int num, ps::Customer* obj) { - num_ = num; - obj_ = obj; - } - - using Message = std::shared_ptr; - - bool Has(int time) { - return msgs_.find(time) != msgs_.end(); - } - - void Add(int time, const Message& msg) { - msgs_[time].push_back(msg); - msg->replied = true; - } - - size_t Size() { - return msgs_.size(); - } - - size_t Count(int time) { - return msgs_[time].size(); - } - - bool Done(int time) { - return Count(time) == (size_t)num_; - } - - void Remove(int time) { - for (auto& m : msgs_[time]) { - CHECK_NOTNULL(obj_)->Reply(m.get()); - } - msgs_.erase(time); - } - - private: - std::unordered_map> msgs_; - int num_; - ps::Customer* obj_; -}; - -/** \brief to match worker/server's app id */ -#define PS_KV_ID 9 - -/** \brief to match worker/server's app id */ -#define PS_APP_ID 10 - -/** - * \brief a server node on ps - */ -class MXNetServer : public ps::App { - public: - MXNetServer() : App(PS_APP_ID) { } - virtual ~MXNetServer() { } - - void set_controller(const KVStore::Controller& ctrl) { - controller_ = ctrl; - } - - void ProcessRequest(ps::Message* request) override { - // wait for one second if controller_ is not inited - for (int i = 0; i < 100; ++i) { - if (!controller_) usleep(10000); - } - CHECK(controller_); - controller_(request->task.cmd(), request->task.msg()); - } - - private: - KVStore::Controller controller_; -}; - -/** - * \brief a worker node on ps - */ -class MXNetWorker : public ps::App { - public: - MXNetWorker() : App(PS_APP_ID) { } - virtual ~MXNetWorker() { } -}; - -/** - * \brief a scheduler node on ps - */ -class MXNetScheduler : public ps::App { - public: - MXNetScheduler() - : App(PS_APP_ID), - barrier_(ps::NodeInfo::NumWorkers(), this) { - } - virtual ~MXNetScheduler() { } - - void ProcessRequest(ps::Message* request) override { - int count; - if (CommandID::GetBarrier(request->task.cmd(), &count)) { - barrier_.Add(count, LastRequest()); - CHECK_EQ(barrier_.Size(), 1); - - if (barrier_.Done(count)) { - barrier_.Remove(count); - } - } - } - - private: - Aggregator barrier_; -}; - -/** - * \brief executor runs a function using it's own thread - */ -class Executor { - public: - /** - * \brief start the executor - */ - void Start() { - std::unique_lock lk(mu_); - while (true) { - cond_.wait(lk, [this]{return !queue_.empty();}); - Block blk = std::move(queue_.front()); - queue_.pop(); - 
lk.unlock(); - - if (blk.f) { - blk.f(); blk.p.set_value(); - } else { - blk.p.set_value(); break; - } - - lk.lock(); - } - } - - /** - * \brief function - */ - typedef std::function Func; - - /** - * \brief exec a function. threadsafe - */ - void Exec(const Func& func) { - Block blk(func); - auto fut = blk.p.get_future(); - { - std::lock_guard lk(mu_); - queue_.push(std::move(blk)); - cond_.notify_one(); - } - fut.wait(); - } - - /** - * \brief stop, threadsafe - */ - void Stop() { - Exec(Func()); - } - - private: - struct Block { - explicit Block(const Func& func) : f(func) { } - Func f; - std::promise p; - }; - std::queue queue_; - std::mutex mu_; - std::condition_variable cond_; -}; - -/** - * \brief distributed kvstore for servers - */ -class KVStoreDistServer { - public: - explicit KVStoreDistServer(const KVStore::Controller& user_ctrl) - // set updater - : store_(ServerHandle(this), 1, 1, PS_KV_ID) { - // set controller - sync_mode_ = false; - auto controller - = [user_ctrl, this](int cmd_id, const std::string& cmd_body) { - if (cmd_id == CommandID::kStop) { - exec_.Stop(); - } else if (cmd_id == CommandID::kSyncMode) { - sync_mode_ = true; - } else { - // let the main thread to execute ctrl, which is necessary for python - exec_.Exec([user_ctrl, cmd_id, cmd_body]() { - CHECK(user_ctrl); - user_ctrl(cmd_id, cmd_body); - }); - } - }; - auto node = CHECK_NOTNULL(ps::NodeInfo::MyApp()); - static_cast(node)->set_controller(controller); - } - - // ~KVStoreDistServer() { - // // clear all ndarrays before Engine is shutting down. - // store_.server()->Clear(); - // } - - void set_updater(const KVStore::Updater& updater) { - CHECK(updater); - updater_ = updater; - } - - void Run() { - exec_.Start(); - } - - private: - /** - * \brief value type stored at server - */ - struct ServerVal { - std::vector data; - inline void Load(dmlc::Stream *fi) { fi->Read(&data); } - inline void Save(dmlc::Stream *fo) const { fo->Write(data); } - inline bool Empty() const { return data.empty(); } - }; - - /** - * \brief server handle - */ - class ServerHandle { - public: - explicit ServerHandle(KVStoreDistServer* kvstore) - : kvstore_(kvstore), - ps_obj_(nullptr), - aggregator_(nullptr) { - } - - ~ServerHandle() { - delete aggregator_; - } - - /** - * \brief get a cpu ndarray from a c-array without data copy - */ - inline NDArray GetNDArray(real_t* data, size_t size) { - size_t ds[] = {size}; - TShape dshape(ds, ds + 1); - TBlob data_blob(data, dshape, cpu::kDevMask); - return NDArray(data_blob, 0); - } - - inline void Start(bool push, int timestamp, int cmd_id, void* msg) { } - inline void Finish() { } - inline void Load(dmlc::Stream *fi) { } - inline void Save(dmlc::Stream *fo) const { } - - inline void Push(ps::Key recv_key, - ps::Blob recv_val, - ServerVal& my_val) { // NOLINT(*) - // initialization - if (my_val.Empty()) { - my_val.data.resize(recv_val.size); - memcpy(my_val.data.data(), recv_val.data, - recv_val.size * sizeof(real_t)); - return; - } - - int key = DecodeKey(recv_key); - NDArray recv_array = GetNDArray((real_t*)recv_val.data, // NOLINT(*) - recv_val.size); - NDArray my_array = GetNDArray(my_val.data.data(), my_val.data.size()); - - if (kvstore_->sync_mode_) { - // create aggregator - if (aggregator_ == nullptr) { - ps_obj_ = CHECK_NOTNULL(kvstore_)->store_.server(); - aggregator_ = new Aggregator( - ps::NodeInfo::NumWorkers(), ps_obj_); - } - - // init merge buf - std::vector& buf = merge_buf_[key]; - if (!aggregator_->Has(key)) { - if (buf.empty()) { - buf.resize(recv_val.size); - } - 
memset(buf.data(), 0, buf.size() * sizeof(real_t)); - } - - // add recved data into merge - NDArray merge = GetNDArray(buf.data(), buf.size()); - merge += recv_array; - - // update if aggregation is done - aggregator_->Add(key, ps_obj_->LastRequest()); - if (aggregator_->Done(key)) { - // let the main thread to execute updater_, which is necessary for - // python - merge.WaitToRead(); - kvstore_->exec_.Exec([this, key, &merge, &my_array](){ - CHECK(kvstore_->updater_); - kvstore_->updater_(key, merge, &my_array); - }); - aggregator_->Remove(key); - } - } else { - // runs eventual consistency model. so update immediately - - // let the main thread to execute updater_, which is necessary for - // python - kvstore_->exec_.Exec([this, key, &recv_array, &my_array](){ - CHECK(kvstore_->updater_); - kvstore_->updater_(key, recv_array, &my_array); - }); - } - // place waittoread here rather than the beginning of pull. - my_array.WaitToRead(); - } - - inline void Pull(ps::Key recv_key, - const ServerVal& my_val, - ps::Blob& send_val) { // NOLINT(*) - CHECK(!my_val.Empty()) - << DecodeKey(recv_key) << " is not inited"; - - send_val.data = (real_t*) my_val.data.data(); // NOLINT(*) - send_val.size = my_val.data.size(); - } - - private: - /** - * \brief convert from a key in ps - */ - inline int DecodeKey(ps::Key key) { - return static_cast( - (key << CommandID::kIndexBits) >> CommandID::kIndexBits); - } - /** - * \brief for BSP model - */ - std::unordered_map> merge_buf_; - /** - * \brief the current timestamp - */ - // int curr_timestamp_; - - KVStoreDistServer* kvstore_; - - ps::Customer* ps_obj_; - Aggregator* aggregator_; - }; - - - /** - * \brief let the main thread execute python codes - */ - Executor exec_; - - bool sync_mode_; - - KVStore::Updater updater_; - - ps::OnlineServer store_; -}; - - -} // namespace kvstore -} // namespace mxnet - -#endif // MXNET_KVSTORE_MXNET_PS_NODE_H_ diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc index 1aecbd39508b..404c0891f984 100644 --- a/src/ndarray/ndarray.cc +++ b/src/ndarray/ndarray.cc @@ -507,10 +507,9 @@ bool NDArray::Load(dmlc::Stream *strm) { const uint64_t kMXAPINDArrayListMagic = 0x112; -void NDArray::Save(const std::string& fname, +void NDArray::Save(dmlc::Stream* fo, const std::vector& data, const std::vector& names) { - std::unique_ptr fo(dmlc::Stream::Create(fname.c_str(), "w")); uint64_t header = kMXAPINDArrayListMagic, reserved = 0; fo->Write(&header, sizeof(header)); fo->Write(&reserved, sizeof(reserved)); @@ -518,10 +517,9 @@ void NDArray::Save(const std::string& fname, fo->Write(names); } -void NDArray::Load(const std::string& fname, +void NDArray::Load(dmlc::Stream* fi, std::vector* data, std::vector* keys) { - std::unique_ptr fi(dmlc::Stream::Create(fname.c_str(), "r")); uint64_t header, reserved; CHECK(fi->Read(&header)) << "Invalid NDArray file format"; @@ -601,6 +599,7 @@ void NDArray::SyncCopyToCPU(real_t *data, size_t size) const { // those with underscore will be registered at NDArray MXNET_REGISTER_NDARRAY_FUN(_set_value).set_function(SetValueOp); + MXNET_REGISTER_NDARRAY_FUN(_plus).set_function(BinaryOp); MXNET_REGISTER_NDARRAY_FUN(_minus).set_function(BinaryOp); MXNET_REGISTER_NDARRAY_FUN(_mul).set_function(BinaryOp); @@ -608,6 +607,14 @@ MXNET_REGISTER_NDARRAY_FUN(_div).set_function(BinaryOp); MXNET_REGISTER_NDARRAY_FUN(dot).set_function(BinaryOp) .describe("Calcuate 2D matrix multiplication"); + +MXNET_REGISTER_NDARRAY_FUN(_onehot_encode).set_function(BinaryOp); + +MXNET_REGISTER_NDARRAY_FUN(choose_element) 
+.set_function(BinaryOp) +.describe("Choose one element from each line(row for python, column for R/Julia)" + " in lhs according to index indicated by rhs"); + // register API function // those with underscore will be registered at NDArray MXNET_REGISTER_NDARRAY_FUN(_plus_scalar).set_function(ScalarOp); diff --git a/src/ndarray/ndarray_function-inl.h b/src/ndarray/ndarray_function-inl.h index 487152b2bd0f..20f9eb8c65a0 100644 --- a/src/ndarray/ndarray_function-inl.h +++ b/src/ndarray/ndarray_function-inl.h @@ -47,7 +47,7 @@ inline void EvalBinary_(const TBlob &lhs, const TBlob &rhs, template inline void EvalDot_(const TBlob &lhs, const TBlob &rhs, - TBlob *ret, RunContext ctx) { + TBlob *ret, RunContext ctx) { using namespace mshadow::expr; mshadow::Stream *s = ctx.get_stream(); ret->FlatTo2D(s) @@ -55,6 +55,26 @@ inline void EvalDot_(const TBlob &lhs, const TBlob &rhs, rhs.FlatTo2D(s)); } +template +inline void EvalOneHot_(const TBlob &index, const TBlob &rhs, + TBlob *ret, RunContext ctx) { + using namespace mshadow::expr; + mshadow::Stream *s = ctx.get_stream(); + ret->get(s) + = one_hot_encode(index.get(s), + rhs.shape_[1]); +} + +template +inline void EvalMatChooseRowElem_(const TBlob &lhs, const TBlob &rhs, + TBlob *ret, RunContext ctx) { + using namespace mshadow::expr; + mshadow::Stream *s = ctx.get_stream(); + ret->get(s) + = mat_choose_row_element(lhs.get(s), + rhs.get(s)); +} + template inline void EvalScalar_(const TBlob &lhs, const real_t &rhs, TBlob *ret, RunContext ctx) { @@ -150,7 +170,7 @@ void ElementwiseSum(const std::vector source, } default: { Tensor in_0 = source[0].FlatTo2D(s); - out = F(in_0); + out = F(in_0); for (size_t i = 1; i < source.size(); ++i) { out += source[i].FlatTo2D(s); } @@ -160,7 +180,9 @@ void ElementwiseSum(const std::vector source, } // declarations +DECL_BINARY(DEVICE, MatChooseRowElem, EvalMatChooseRowElem_) DECL_BINARY(DEVICE, Dot, EvalDot_) +DECL_BINARY(DEVICE, OneHotEncode, EvalOneHot_) DECL_BINARY(DEVICE, Plus, EvalBinary_) DECL_BINARY(DEVICE, Minus, EvalBinary_) DECL_BINARY(DEVICE, Mul, EvalBinary_) diff --git a/src/ndarray/ndarray_function.h b/src/ndarray/ndarray_function.h index f23b696bf5eb..9f23c1a5c348 100644 --- a/src/ndarray/ndarray_function.h +++ b/src/ndarray/ndarray_function.h @@ -11,6 +11,7 @@ #include #include #include +#include "../operator/mshadow_op.h" namespace mxnet { /*! 
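[Editor's note] Two new binary NDArray functions are wired in here: _onehot_encode (EvalOneHot_) and choose_element (EvalMatChooseRowElem_). Their shape contracts, enforced a little further down in ndarray_function.h, are index (n,) with prototype (n, k) producing a one-hot output of shape (n, k), and lhs (n, k) with index (n,) producing an output of shape (n,). The plain-C++ sketch below is not MXNet code (container types and names are mine); it spells out the intended element-wise behaviour on flat row-major buffers.

// Illustration only -- not MXNet code.
#include <cstddef>
#include <vector>

// _onehot_encode: index (n,), prototype (n, k) -> out (n, k)
std::vector<float> one_hot(const std::vector<int>& index, std::size_t k) {
  std::vector<float> out(index.size() * k, 0.0f);
  for (std::size_t i = 0; i < index.size(); ++i)
    out[i * k + static_cast<std::size_t>(index[i])] = 1.0f;
  return out;
}

// choose_element: lhs (n, k), index (n,) -> out (n,)
std::vector<float> choose_element(const std::vector<float>& lhs,
                                  const std::vector<int>& index,
                                  std::size_t k) {
  std::vector<float> out(index.size());
  for (std::size_t i = 0; i < index.size(); ++i)
    out[i] = lhs[i * k + static_cast<std::size_t>(index[i])];
  return out;
}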
\brief namespace to support all possible Ndarray operator */ @@ -22,6 +23,7 @@ struct BinaryBase { return lshape; } }; + // operators struct Plus : public BinaryBase { typedef mshadow::op::plus mshadow_op; @@ -72,6 +74,24 @@ struct Dot { } }; + +struct OneHotEncode { + inline static TShape GetShape(const TShape &index, const TShape &proptype) { + CHECK(index.ndim() == 1 && proptype.ndim() == 2) << "OneHotEncode only support 1d index."; + CHECK_EQ(index[0], proptype[0]) << "OneHotEncode shape inconsistent"; + return proptype; + } +}; + +struct MatChooseRowElem { + inline static TShape GetShape(const TShape &lshape, const TShape &rshape) { + CHECK(lshape.ndim() == 2 && rshape.ndim() == 1) + << "choose_row_element only support 2D Matrix and 1D index"; + CHECK_EQ(lshape[0], rshape[0]) << "choose_row_element index and matrix shape mismatch"; + return rshape; + } +}; + // type holder for random number generators struct UniformDistribution {}; @@ -84,6 +104,9 @@ void EvalClip(const TBlob &src, const real_t &a_min, const real_t &a_max, template void Eval(const TBlob &lhs, const TBlob &rhs, TBlob *ret, RunContext ctx); +template +void Eval(const TBlob &src, TBlob *ret, RunContext ctx); + template void Eval(const TBlob &lhs, const real_t &rhs, TBlob *ret, RunContext ctx); diff --git a/src/ndarray/unary_function-inl.h b/src/ndarray/unary_function-inl.h new file mode 100644 index 000000000000..1082fd826057 --- /dev/null +++ b/src/ndarray/unary_function-inl.h @@ -0,0 +1,116 @@ +/*! + * Copyright (c) 2015 by Contributors + * \file unary-function-inl.h + * \brief the real execution functions of ndarray operations + */ +#ifndef MXNET_NDARRAY_UNARY_FUNCTION_INL_H_ +#define MXNET_NDARRAY_UNARY_FUNCTION_INL_H_ + +#include "../common/tblob_op_registry.h" +#include "../operator/mshadow_op.h" +#include "../operator/operator_common.h" +#if defined(__CUDACC__) +#define XPU gpu +#else +#define XPU cpu +#endif + +namespace mxnet { +namespace ndarray { + +using namespace common; // NOLINT(*) + +template +void UnaryForward_(const TBlob &src, + TBlob *ret, + OpReqType req, + RunContext ctx) { + using namespace mxnet::op; + using namespace mshadow::expr; + mshadow::Stream *s = ctx.get_stream(); + mshadow::Tensor out = ret->FlatTo2D(s); + Assign(out, req, F(src.FlatTo2D(s))); +} + +// backward function that takes input value of the op +template +void UnaryBackwardUseIn_(const arg::OutGrad& out_grad, + const arg::Input0& in_data0, + TBlob *in_grad, + OpReqType req, + RunContext ctx) { + using namespace mxnet::op; + using namespace mshadow::expr; + mshadow::Stream *s = ctx.get_stream(); + mshadow::Tensor igrad = in_grad->FlatTo2D(s); + Assign(igrad, req, + F(in_data0.data.FlatTo2D(s)) * + out_grad.data.FlatTo2D()); +} + +// backward function that takes output value of the op +template +void UnaryBackwardUseOut_(const arg::OutGrad& out_grad, + const arg::OutValue& out_value, + TBlob *in_grad, + OpReqType req, + RunContext ctx) { + using namespace mxnet::op; + using namespace mshadow::expr; + mshadow::Stream *s = ctx.get_stream(); + mshadow::Tensor igrad = in_grad->FlatTo2D(s); + Assign(igrad, req, + F(out_value.data.FlatTo2D(s)) * + out_grad.data.FlatTo2D()); +} + +// return a shape of scalar +inline TShape ScalarShape(const TShape& ishape) { + mshadow::index_t shape[] = {1}; + return TShape(shape, shape + 1); +} + +template +void L2Norm(const TBlob &src, + TBlob *ret, + OpReqType req, + RunContext ctx) { + mshadow::Stream *s = ctx.get_stream(); + mshadow::Tensor out = ret->get(s); + mshadow::Tensor in = + 
src.get_with_shape(mshadow::Shape1(src.shape_.Size()), s); + mshadow::VectorDot(out, in, in); + out = mshadow::expr::F(out); +} +// Register all unary operations here +// The true means inplace can be enabled. +// square +MXNET_REGISTER_TBLOB_FUN(square, XPU) +.set_function(XPU::kDevMask, UnaryForward_, true) +.set_gradient(XPU::kDevMask, UnaryBackwardUseIn_, true) +.describe("Take square of the src"); +// sqrt +MXNET_REGISTER_TBLOB_FUN(sqrt, XPU) +.set_function(XPU::kDevMask, UnaryForward_, true) +.set_gradient(XPU::kDevMask, UnaryBackwardUseOut_, true) +.describe("Take sqrt of the src"); +// exp +MXNET_REGISTER_TBLOB_FUN(exp, XPU) +.set_function(XPU::kDevMask, UnaryForward_, true) +.set_gradient(XPU::kDevMask, UnaryBackwardUseOut_, true) +.describe("Take exp of the src"); +// log +MXNET_REGISTER_TBLOB_FUN(log, XPU) +.set_function(XPU::kDevMask, UnaryForward_, true) +.set_gradient(XPU::kDevMask, UnaryBackwardUseIn_, true) +.describe("Take log of the src"); +// L2 norm +MXNET_REGISTER_TBLOB_FUN(norm, XPU) +.set_function(XPU::kDevMask, L2Norm, false, false) +.set_shape_infer(ScalarShape) +.describe("Take L2 norm of the src." + "The result will be ndarray of shape (1,) on the same device."); + +} // namespace ndarray +} // namespace mxnet +#endif // MXNET_NDARRAY_UNARY_FUNCTION_INL_H_ diff --git a/src/ndarray/unary_function.cc b/src/ndarray/unary_function.cc new file mode 100644 index 000000000000..f77f113e611e --- /dev/null +++ b/src/ndarray/unary_function.cc @@ -0,0 +1,7 @@ +/*! + * Copyright (c) 2015 by Contributors + * \file unary_function.cc + * \brief CPU Implementation of unary function. + */ +// this will be invoked by gcc and compile CPU version +#include "./unary_function-inl.h" diff --git a/src/ndarray/unary_function.cu b/src/ndarray/unary_function.cu new file mode 100644 index 000000000000..0c0d4e64957c --- /dev/null +++ b/src/ndarray/unary_function.cu @@ -0,0 +1,8 @@ +/*! + * Copyright (c) 2015 by Contributors + * \file unary_function.cu + * \brief GPU Implementation of unary function. + */ +// this will be invoked by gcc and compile GPU version +// real common implementation is only in the -inl.h file. +#include "./unary_function-inl.h" diff --git a/src/operator/activation-inl.h b/src/operator/activation-inl.h index e18dbe68fb25..6280c1664e84 100644 --- a/src/operator/activation-inl.h +++ b/src/operator/activation-inl.h @@ -21,18 +21,20 @@ namespace mxnet { namespace op { // Declare enumeration of input order to make code more intuitive. 
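[Editor's note] Most of the operator-side churn from here on is one mechanical refactor: the per-operator enums (kData, kOut, kWeight, ...) used to sit directly in namespace mxnet::op, and each header now wraps them in a small per-operator namespace (activation, batchnorm, concat_enum, conv, dropout, elembinary, ...) with call sites qualifying the constants, presumably so that identically named enumerators from different operator headers can coexist in one translation unit. A compilable sketch of the before/after pattern, with simplified names that are not part of the patch:

// Illustration of the scoping pattern only.
namespace old_style {
  // Two headers both defining kData at the same scope cannot be included
  // into the same translation unit:
  // enum ActivationOpInputs {kData};
  // enum DropoutOpInputs    {kData};   // error: redefinition of 'kData'
}

namespace new_style {
  namespace activation { enum ActivationOpInputs { kData, kOut }; }
  namespace dropout    { enum DropoutOpInputs    { kData };       }

  // Call sites now qualify the constant, e.g.
  // in_data[activation::kData], out_data[activation::kOut]
  inline int example() { return activation::kData + dropout::kData; }
}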
// // These enums are only visible within this header +namespace activation { enum ActivationOpInputs {kData}; enum ActivationOpOutputs {kOut}; enum ActivationOpType {kReLU, kSigmoid, kTanh}; +} // activation struct ActivationParam : public dmlc::Parameter { // use int for enumeration int act_type; DMLC_DECLARE_PARAMETER(ActivationParam) { DMLC_DECLARE_FIELD(act_type) - .add_enum("relu", kReLU) - .add_enum("sigmoid", kSigmoid) - .add_enum("tanh", kTanh) + .add_enum("relu", activation::kReLU) + .add_enum("sigmoid", activation::kSigmoid) + .add_enum("tanh", activation::kTanh) .describe("Activation function to be applied."); } }; @@ -54,9 +56,9 @@ class ActivationOp : public Operator { CHECK_EQ(in_data.size(), 1); CHECK_EQ(out_data.size(), 1); Stream *s = ctx.get_stream(); - Tensor data = in_data[kData].FlatTo2D(s); - Tensor out = out_data[kOut].FlatTo2D(s); - Assign(out, req[kOut], F(data)); + Tensor data = in_data[activation::kData].FlatTo2D(s); + Tensor out = out_data[activation::kOut].FlatTo2D(s); + Assign(out, req[activation::kOut], F(data)); } virtual void Backward(const OpContext &ctx, @@ -72,10 +74,10 @@ class ActivationOp : public Operator { CHECK(in_data.size() == 1 && in_grad.size() == 1); CHECK_EQ(req.size(), 1); Stream *s = ctx.get_stream(); - Tensor m_out_grad = out_grad[kOut].FlatTo2D(s); - Tensor m_out_data = out_data[kOut].FlatTo2D(s); - Tensor m_in_grad = in_grad[kData].FlatTo2D(s); - Assign(m_in_grad, req[kData], F(m_out_data) * m_out_grad); + Tensor m_out_grad = out_grad[activation::kOut].FlatTo2D(s); + Tensor m_out_data = out_data[activation::kOut].FlatTo2D(s); + Tensor m_in_grad = in_grad[activation::kData].FlatTo2D(s); + Assign(m_in_grad, req[activation::kData], F(m_out_data) * m_out_grad); } }; // class ActivationOp @@ -99,7 +101,7 @@ class ActivationProp : public OperatorProperty { std::vector *aux_shape) const override { using namespace mshadow; CHECK_EQ(in_shape->size(), 1) << "Input:[data]"; - const TShape &dshape = in_shape->at(kData); + const TShape &dshape = in_shape->at(activation::kData); if (dshape.ndim() == 0) return false; out_shape->clear(); out_shape->push_back(dshape); @@ -122,9 +124,9 @@ class ActivationProp : public OperatorProperty { const std::vector &in_data, const std::vector &out_data) const override { #if MXNET_USE_CUDNN == 1 - return {out_grad[kOut], out_data[kOut], in_data[kData]}; + return {out_grad[activation::kOut], out_data[activation::kOut], in_data[activation::kData]}; #else - return {out_grad[kOut], out_data[kOut]}; + return {out_grad[activation::kOut], out_data[activation::kOut]}; #endif // MXNET_USE_CUDNN } @@ -133,13 +135,13 @@ class ActivationProp : public OperatorProperty { const std::vector &in_data, const std::vector &out_data, const std::vector &in_grad) const override { - return {{out_grad[kOut], in_grad[kData]}}; + return {{out_grad[activation::kOut], in_grad[activation::kData]}}; } std::vector > ForwardInplaceOption( const std::vector &in_data, const std::vector &out_data) const override { - return {{in_data[kData], out_data[kOut]}}; + return {{in_data[activation::kData], out_data[activation::kOut]}}; } Operator* CreateOperator(Context ctx) const; diff --git a/src/operator/activation.cc b/src/operator/activation.cc index 04a8da24eed9..019ac4bf0bb7 100644 --- a/src/operator/activation.cc +++ b/src/operator/activation.cc @@ -12,11 +12,11 @@ namespace op { template<> Operator *CreateOp(ActivationParam param) { switch (param.act_type) { - case kReLU: + case activation::kReLU: return new ActivationOp(); - case kSigmoid: + case 
activation::kSigmoid: return new ActivationOp(); - case kTanh: + case activation::kTanh: return new ActivationOp(); default: LOG(FATAL) << "unknown activation type"; diff --git a/src/operator/activation.cu b/src/operator/activation.cu index 2c9c29c04f45..51cac51c70f4 100644 --- a/src/operator/activation.cu +++ b/src/operator/activation.cu @@ -18,11 +18,11 @@ Operator *CreateOp(ActivationParam param) { return new CuDNNActivationOp(param); #else switch(param.act_type) { - case kReLU: + case activation::kReLU: return new ActivationOp(); - case kSigmoid: + case activation::kSigmoid: return new ActivationOp(); - case kTanh: + case activation::kTanh: return new ActivationOp(); default: LOG(FATAL) << "unknown activation"; diff --git a/src/operator/batch_norm-inl.h b/src/operator/batch_norm-inl.h index 625614725282..f031058f899e 100644 --- a/src/operator/batch_norm-inl.h +++ b/src/operator/batch_norm-inl.h @@ -19,10 +19,13 @@ namespace mxnet { namespace op { + +namespace batchnorm { enum BatchNormOpInputs {kData, kGamma, kBeta}; enum BatchNormOpOutputs {kOut, kOutNoAffine, kMean, kVar}; enum BatchNormOpAuxiliary {kMovingMean, kMovingVar}; enum BatchNormBackResource {kTempSpace}; +} // namespace batchnorm struct BatchNormParam : public dmlc::Parameter { float eps; @@ -57,48 +60,48 @@ class BatchNormOp : public Operator { } else { CHECK_GE(out_data.size(), 1); CHECK_GE(req.size(), 1); - CHECK_EQ(req[kOut], kWriteTo); + CHECK_EQ(req[batchnorm::kOut], kWriteTo); } Stream *s = ctx.get_stream(); - const real_t scale = static_cast(in_data[kData].shape_[1]) / - static_cast(in_data[kData].shape_.Size()); + const real_t scale = static_cast(in_data[batchnorm::kData].shape_[1]) / + static_cast(in_data[batchnorm::kData].shape_.Size()); Tensor data; Tensor out, out_no_affine; - if (in_data[kData].ndim() == 2) { - uint32_t ds[] = {in_data[kData].shape_[0], in_data[kData].shape_[1], 1, 1}; - TShape dshape(ds, ds + 4); - data = in_data[kData].get_with_shape(dshape, s); - out = out_data[kOut].get_with_shape(dshape, s); + if (in_data[batchnorm::kData].ndim() == 2) { + Shape<4> dshape = Shape4(in_data[batchnorm::kData].shape_[0], + in_data[batchnorm::kData].shape_[1], 1, 1); + data = in_data[batchnorm::kData].get_with_shape(dshape, s); + out = out_data[batchnorm::kOut].get_with_shape(dshape, s); if (ctx.is_train) { - out_no_affine = out_data[kOutNoAffine].get_with_shape(dshape, s); + out_no_affine = out_data[batchnorm::kOutNoAffine].get_with_shape(dshape, s); } } else { - data = in_data[kData].get(s); - out = out_data[kOut].get(s); + data = in_data[batchnorm::kData].get(s); + out = out_data[batchnorm::kOut].get(s); if (ctx.is_train) { - out_no_affine = out_data[kOutNoAffine].get(s); + out_no_affine = out_data[batchnorm::kOutNoAffine].get(s); } } - Tensor slope = in_data[kGamma].get(s); - Tensor bias = in_data[kBeta].get(s); - Tensor moving_mean = aux_states[kMovingMean].get(s); - Tensor moving_var = aux_states[kMovingVar].get(s); + Tensor slope = in_data[batchnorm::kGamma].get(s); + Tensor bias = in_data[batchnorm::kBeta].get(s); + Tensor moving_mean = aux_states[batchnorm::kMovingMean].get(s); + Tensor moving_var = aux_states[batchnorm::kMovingVar].get(s); // cal if (ctx.is_train) { - Tensor mean = out_data[kMean].get(s); - Tensor var = out_data[kVar].get(s); - Assign(mean, req[kMean], scale * sumall_except_dim<1>(data)); - Assign(var, req[kVar], scale * sumall_except_dim<1>( + Tensor mean = out_data[batchnorm::kMean].get(s); + Tensor var = out_data[batchnorm::kVar].get(s); + Assign(mean, req[batchnorm::kMean], 
scale * sumall_except_dim<1>(data)); + Assign(var, req[batchnorm::kVar], scale * sumall_except_dim<1>( F(data - broadcast<1>(mean, data.shape_)))); - Assign(out_no_affine, req[kOutNoAffine], (data - broadcast<1>(mean, data.shape_)) / + Assign(out_no_affine, req[batchnorm::kOutNoAffine], (data - broadcast<1>(mean, data.shape_)) / F(broadcast<1>(var + param_.eps, data.shape_))); - Assign(out, req[kOut], out_no_affine * broadcast<1>(slope, out.shape_) + + Assign(out, req[batchnorm::kOut], out_no_affine * broadcast<1>(slope, out.shape_) + broadcast<1>(bias, out.shape_)); moving_mean = moving_mean * param_.momentum + mean * (1 - param_.momentum); moving_var = moving_var * param_.momentum + var * (1 - param_.momentum); } else { - Assign(out, req[kOut], broadcast<1>(slope / + Assign(out, req[batchnorm::kOut], broadcast<1>(slope / F(moving_var + param_.eps), data.shape_) * data + broadcast<1>(bias - (slope * moving_mean) / @@ -122,32 +125,32 @@ class BatchNormOp : public Operator { Stream *s = ctx.get_stream(); Tensor data, grad, grad_in; Tensor out, out_no_affine; - const real_t scale = static_cast(out_data[kOut].shape_[1]) / - static_cast(out_data[kOut].shape_.Size()); - if (in_data[kData].ndim() == 2) { - uint32_t ds[] = {out_data[kOut].shape_[0], out_data[kOut].shape_[1], 1, 1}; - TShape dshape(ds, ds + 4); - data = in_data[kData].get_with_shape(dshape, s); - grad = out_grad[kOut].get_with_shape(dshape, s); - grad_in = in_grad[kData].get_with_shape(dshape, s); - out = out_data[kOut].get_with_shape(dshape, s); - out_no_affine = out_data[kOutNoAffine].get_with_shape(dshape, s); + const real_t scale = static_cast(out_data[batchnorm::kOut].shape_[1]) / + static_cast(out_data[batchnorm::kOut].shape_.Size()); + if (in_data[batchnorm::kData].ndim() == 2) { + Shape<4> dshape = Shape4(out_data[batchnorm::kOut].shape_[0], + out_data[batchnorm::kOut].shape_[1], 1, 1); + data = in_data[batchnorm::kData].get_with_shape(dshape, s); + grad = out_grad[batchnorm::kOut].get_with_shape(dshape, s); + grad_in = in_grad[batchnorm::kData].get_with_shape(dshape, s); + out = out_data[batchnorm::kOut].get_with_shape(dshape, s); + out_no_affine = out_data[batchnorm::kOutNoAffine].get_with_shape(dshape, s); } else { - data = in_data[kData].get(s); - grad = out_grad[kOut].get(s); - grad_in = in_grad[kData].get(s); - out = out_data[kOut].get(s); - out_no_affine = out_data[kOutNoAffine].get(s); + data = in_data[batchnorm::kData].get(s); + grad = out_grad[batchnorm::kOut].get(s); + grad_in = in_grad[batchnorm::kData].get(s); + out = out_data[batchnorm::kOut].get(s); + out_no_affine = out_data[batchnorm::kOutNoAffine].get(s); } - Tensor mean = out_data[kMean].get(s); - Tensor var = out_data[kVar].get(s); - Tensor slope = in_data[kGamma].get(s); + Tensor mean = out_data[batchnorm::kMean].get(s); + Tensor var = out_data[batchnorm::kVar].get(s); + Tensor slope = in_data[batchnorm::kGamma].get(s); // Tensor bias = in_data[kBeta].get(s); - Tensor gslope = in_grad[kGamma].get(s); - Tensor gbias = in_grad[kBeta].get(s); + Tensor gslope = in_grad[batchnorm::kGamma].get(s); + Tensor gbias = in_grad[batchnorm::kBeta].get(s); // get requested temp space - Tensor workspace = ctx.requested[kTempSpace].get_space( + Tensor workspace = ctx.requested[batchnorm::kTempSpace].get_space( mshadow::Shape2(3, out.shape_[1]), s); Tensor gmean = workspace[0]; Tensor gvar = workspace[1]; @@ -164,9 +167,9 @@ class BatchNormOp : public Operator { tmp *= gvar; gmean += tmp; // assign - Assign(gslope, req[kGamma], sumall_except_dim<1>(grad * 
out_no_affine)); - Assign(gbias, req[kBeta], sumall_except_dim<1>(grad)); - Assign(grad_in, req[kData], (grad * broadcast<1>(slope, data.shape_)) * + Assign(gslope, req[batchnorm::kGamma], sumall_except_dim<1>(grad * out_no_affine)); + Assign(gbias, req[batchnorm::kBeta], sumall_except_dim<1>(grad)); + Assign(grad_in, req[batchnorm::kData], (grad * broadcast<1>(slope, data.shape_)) * broadcast<1>(1.0f / F(var + param_.eps), data.shape_) + broadcast<1>(gvar, data.shape_) * scale * 2.0f * (data - broadcast<1>(mean, data.shape_)) + @@ -226,9 +229,14 @@ class BatchNormProp : public OperatorProperty { const std::vector &out_grad, const std::vector &in_data, const std::vector &out_data) const override { - return {out_grad[kOut], - out_data[kOut], out_data[kOutNoAffine], out_data[kMean], out_data[kVar], - in_data[kData], in_data[kGamma], in_data[kBeta] + return {out_grad[batchnorm::kOut], + out_data[batchnorm::kOut], + out_data[batchnorm::kOutNoAffine], + out_data[batchnorm::kMean], + out_data[batchnorm::kVar], + in_data[batchnorm::kData], + in_data[batchnorm::kGamma], + in_data[batchnorm::kBeta] }; } @@ -237,7 +245,7 @@ class BatchNormProp : public OperatorProperty { const std::vector &in_data, const std::vector &out_data, const std::vector &in_grad) const override { - return {{out_grad[kOut], in_grad[kData]}}; + return {{out_grad[batchnorm::kOut], in_grad[batchnorm::kData]}}; } std::vector BackwardResource( diff --git a/src/operator/block_grad-inl.h b/src/operator/block_grad-inl.h new file mode 100644 index 000000000000..012dc7a2da63 --- /dev/null +++ b/src/operator/block_grad-inl.h @@ -0,0 +1,112 @@ +/*! + * Copyright (c) 2015 by Contributors + * \file block_grad-inl.h + * \brief + * \author Bing Xu +*/ +#ifndef MXNET_OPERATOR_BLOCK_GRAD_INL_H_ +#define MXNET_OPERATOR_BLOCK_GRAD_INL_H_ +#include +#include +#include +#include +#include +#include +#include +#include "./mshadow_op.h" +#include "./operator_common.h" + +namespace mxnet { +namespace op { + +namespace blockgrad { +enum BlockGradientOpInputs {kData}; +enum BlockGradientOpOutputs {kOut}; +} // namespace blockgrad + +template +class BlockGradientOp : public Operator { + public: + virtual void Forward(const OpContext &ctx, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data, + const std::vector &aux_args) { + using namespace mshadow; + using namespace mshadow::expr; + CHECK_EQ(in_data.size(), 1); + CHECK_EQ(out_data.size(), 1); + Stream *s = ctx.get_stream(); + Tensor data = in_data[blockgrad::kData].FlatTo2D(s); + Tensor out = out_data[blockgrad::kOut].FlatTo2D(s); + out = F(data); + } + + virtual void Backward(const OpContext &ctx, + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &req, + const std::vector &in_grad, + const std::vector &aux_args) { + using namespace mshadow; + using namespace mshadow::expr; + Stream *s = ctx.get_stream(); + Tensor grad = in_grad[blockgrad::kData].FlatTo2D(s); + grad = 0.f; + } +}; // class BlockGradientOp + +template +Operator *CreateOp(); + +#if DMLC_USE_CXX11 +class BlockGradientProp : public OperatorProperty { + public: + void Init(const std::vector >& kwargs) override {} + + std::map GetParams() const override { + return std::map(); + } + + bool InferShape(std::vector *in_shape, + std::vector *out_shape, + std::vector *aux_shape) const override { + using namespace mshadow; + CHECK_EQ(in_shape->size(), 1); + const TShape &dshape = in_shape->at(blockgrad::kData); + if (dshape.ndim() == 0) return false; + 
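[Editor's note] For readers skimming the new BlockGrad operator above: Forward copies the input through unchanged (an identity map) and Backward writes zeros into the input gradient, so no gradient flows past this node. In scalar form (illustration only, not MXNet code):

// y = x in the forward pass; dL/dx = 0 regardless of dL/dy in the backward pass.
float block_grad_forward(float x) { return x; }
float block_grad_backward(float /*dLdy*/) { return 0.0f; }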
out_shape->clear(); + out_shape->push_back(dshape); + return true; + } + + OperatorProperty* Copy() const override { + return new BlockGradientProp(); + } + + std::string TypeString() const override { + return "BlockGrad"; + } + + std::vector DeclareBackwardDependency( + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data) const override { + return {}; + } + + std::vector > ForwardInplaceOption( + const std::vector &in_data, + const std::vector &out_data) const override { + return {{in_data[blockgrad::kData], out_data[blockgrad::kOut]}}; + } + + Operator* CreateOperator(Context ctx) const; +}; // class BlockGradientProperty + +#endif // DMLC_USE_CXX11 +} // namespace op +} // namespace mxnet + +#endif // MXNET_OPERATOR_BLOCK_GRAD_INL_H_ diff --git a/src/operator/block_grad.cc b/src/operator/block_grad.cc new file mode 100644 index 000000000000..67256f79f268 --- /dev/null +++ b/src/operator/block_grad.cc @@ -0,0 +1,26 @@ +/*! + * Copyright (c) 2015 by Contributors + * \file block_grad.cc + * \brief + * \author Bing Xu +*/ +#include "./block_grad-inl.h" + +namespace mxnet { +namespace op { +template<> +Operator *CreateOp() { + return new BlockGradientOp(); +} + +Operator *BlockGradientProp::CreateOperator(Context ctx) const { + DO_BIND_DISPATCH(CreateOp); +} + +MXNET_REGISTER_OP_PROPERTY(BlockGrad, BlockGradientProp) +.describe("Get output from a symbol and pass 0 gradient back") +.add_argument("data", "Symbol", "Input data."); + +} // namespace op +} // namespace mxnet + diff --git a/src/operator/block_grad.cu b/src/operator/block_grad.cu new file mode 100644 index 000000000000..22707e940b7e --- /dev/null +++ b/src/operator/block_grad.cu @@ -0,0 +1,18 @@ +/*! + * Copyright (c) 2015 by Contributors + * \file block_grad.cc + * \brief + * \author Bing Xu +*/ +#include "./block_grad-inl.h" + +namespace mxnet { +namespace op { +template<> +Operator *CreateOp() { + return new BlockGradientOp(); +} + +} // namespace op +} // namespace mxnet + diff --git a/src/operator/concat-inl.h b/src/operator/concat-inl.h index cc51079462b8..3e9c812603e3 100644 --- a/src/operator/concat-inl.h +++ b/src/operator/concat-inl.h @@ -20,8 +20,10 @@ namespace mxnet { namespace op { +namespace concat_enum { enum ConcatOpInputs {kData0, kData1, kData2, kData3, kData4}; enum ConcatOpOutputs {kOut}; +} // namespace concat_enum struct ConcatParam : public dmlc::Parameter { int num_args; @@ -46,26 +48,24 @@ class ConcatOp : public Operator { using namespace mshadow::expr; CHECK_EQ(static_cast(in_data.size()), size_); CHECK_EQ(out_data.size(), 1); - CHECK_EQ(req[kOut], kWriteTo); + CHECK_EQ(req[concat_enum::kOut], kWriteTo); Stream *s = ctx.get_stream(); std::vector > data(size_); Tensor out; - if (in_data[kData0].ndim() == 2) { + if (in_data[concat_enum::kData0].ndim() == 2) { uint32_t dim = 0; for (int i = 0; i < size_; ++i) { - uint32_t ds[] = {in_data[i].shape_[0], in_data[i].shape_[1], 1, 1}; - TShape dshape(ds, ds + 4); + Shape<4> dshape = Shape4(in_data[i].shape_[0], in_data[i].shape_[1], 1, 1); data[i] = in_data[i].get_with_shape(dshape, s); dim += in_data[i].shape_[1]; } - uint32_t ds_out[] = {in_data[kData0].shape_[0], dim, 1, 1}; - TShape dshape_out(ds_out, ds_out + 4); - out = out_data[kOut].get_with_shape(dshape_out, s); + Shape<4> dshape_out = Shape4(in_data[concat_enum::kData0].shape_[0], dim, 1, 1); + out = out_data[concat_enum::kOut].get_with_shape(dshape_out, s); } else { for (int i = 0; i < size_; ++i) { data[i] = in_data[i].get(s); } - out = out_data[kOut].get(s); + 
out = out_data[concat_enum::kOut].get(s); } Concatenate(data, &out); } @@ -84,24 +84,22 @@ class ConcatOp : public Operator { Stream *s = ctx.get_stream(); std::vector > grad_in(size_); Tensor grad; - if (out_grad[kOut].ndim() == 2) { + if (out_grad[concat_enum::kOut].ndim() == 2) { uint32_t dim = 0; for (int i = 0; i < size_; ++i) { - uint32_t ds[] = {in_grad[i].shape_[0], in_grad[i].shape_[1], 1, 1}; - TShape dshape(ds, ds + 4); + Shape<4> dshape = Shape4(in_grad[i].shape_[0], in_grad[i].shape_[1], 1, 1); grad_in[i] = in_grad[i].get_with_shape(dshape, s); dim += in_grad[i].shape_[1]; CHECK_EQ(req[i], kWriteTo); } - uint32_t ds_out[] = {in_grad[kData0].shape_[0], dim, 1, 1}; - TShape dshape_out(ds_out, ds_out + 4); - grad = out_grad[kOut].get_with_shape(dshape_out, s); + Shape<4> dshape_out = Shape4(in_grad[concat_enum::kData0].shape_[0], dim, 1, 1); + grad = out_grad[concat_enum::kOut].get_with_shape(dshape_out, s); } else { for (int i = 0; i < size_; ++i) { grad_in[i] = in_grad[i].get(s); CHECK_EQ(req[i], kWriteTo); } - grad = out_grad[kOut].get(s); + grad = out_grad[concat_enum::kOut].get(s); } Split(grad, &grad_in); } @@ -137,7 +135,7 @@ class ConcatProp : public OperatorProperty { std::vector *aux_shape) const override { using namespace mshadow; CHECK_EQ(in_shape->size(), static_cast(param_.num_args)); - TShape dshape = in_shape->at(kData0); + TShape dshape = in_shape->at(concat_enum::kData0); if (dshape.ndim() == 0) return false; CHECK_GT(dshape.ndim(), 1); for (int i = 1; i < param_.num_args; ++i) { diff --git a/src/operator/convolution-inl.h b/src/operator/convolution-inl.h index 7299ad97ec0b..29a9288b2870 100644 --- a/src/operator/convolution-inl.h +++ b/src/operator/convolution-inl.h @@ -21,9 +21,11 @@ namespace mxnet { namespace op { +namespace conv { enum ConvolutionOpInputs {kData, kWeight, kBias}; enum ConvolutionOpOutputs {kOut}; enum ConvolutionOpResource {kTempSpace}; +} struct ConvolutionParam : public dmlc::Parameter { TShape kernel; @@ -68,26 +70,24 @@ class ConvolutionOp : public Operator { const std::vector &aux_args) { using namespace mshadow; using namespace mshadow::expr; - CHECK_EQ(req[kOut], kWriteTo); + CHECK_EQ(req[conv::kOut], kWriteTo); size_t expected = param_.no_bias ? 
2 : 3; CHECK_EQ(in_data.size(), expected); CHECK_EQ(out_data.size(), 1); - // TODO(bing): check the BLAS Handle, be careful Stream *s = ctx.get_stream(); - Tensor data = in_data[kData].get(s); - uint32_t ws[] = {param_.num_group, - param_.num_filter / param_.num_group, - data.shape_[1] / param_.num_group * param_.kernel[0] * param_.kernel[1] - }; - TShape wmat_shape(ws, ws + 3); - Tensor wmat = in_data[kWeight].get_with_shape(wmat_shape, s); - Tensor out = out_data[kOut].get(s); + Tensor data = in_data[conv::kData].get(s); + Shape<3> wmat_shape = + Shape3(param_.num_group, + param_.num_filter / param_.num_group, + data.shape_[1] / param_.num_group * param_.kernel[0] * param_.kernel[1]); + Tensor wmat = in_data[conv::kWeight].get_with_shape(wmat_shape, s); + Tensor out = out_data[conv::kOut].get(s); #if defined(__CUDACC__) CHECK_EQ(s->blas_handle_ownership_, Stream::OwnHandle) << "Must init CuBLAS handle in stream"; #endif const index_t nbatch = data.size(0); - Tensor workspace = ctx.requested[kTempSpace].get_space( + Tensor workspace = ctx.requested[conv::kTempSpace].get_space( Shape1(this->InitTemp(data.shape_, out.shape_)), s); for (index_t i = 0; i < nbatch; i += nstep_) { const index_t step = std::min(nstep_, nbatch - i); @@ -102,15 +102,15 @@ class ConvolutionOp : public Operator { temp_col = unpack_patch2col(data.Slice(i, i + step), param_.kernel[0], param_.kernel[1], - param_.stride[0]); - // TODO(bing): make mshadow support dual stride + param_.stride[0], + param_.stride[1]); } else { temp_col = unpack_patch2col(pad(data.Slice(i, i + step), param_.pad[0], param_.pad[1]), param_.kernel[0], param_.kernel[1], - param_.stride[0]); - // TODO(bing): make mshadow support dual stride + param_.stride[0], + param_.stride[1]); } const index_t gstride = temp_col.size(0) / param_.num_group; for (uint32_t gid = 0; gid < param_.num_group; ++gid) { @@ -126,7 +126,7 @@ class ConvolutionOp : public Operator { } if (!param_.no_bias) { // add bias, broadcast bias to dim 1: channel - Tensor bias = in_data[kBias].get(s); + Tensor bias = in_data[conv::kBias].get(s); out += broadcast<1>(bias, out.shape_); } } @@ -145,25 +145,24 @@ class ConvolutionOp : public Operator { size_t expected = param_.no_bias == 0 ? 
3 : 2; CHECK(in_data.size() == expected && in_grad.size() == expected); CHECK_EQ(req.size(), expected); - CHECK_EQ(in_data[kWeight].CheckContiguous(), true); + CHECK_EQ(in_data[conv::kWeight].CheckContiguous(), true); // get data Stream *s = ctx.get_stream(); - Tensor data = in_data[kData].get(s); - uint32_t ws[] = {param_.num_group, - param_.num_filter / param_.num_group, - data.shape_[1] / param_.num_group * param_.kernel[0] * param_.kernel[1] - }; - TShape wmat_shape(ws, ws + 3); - Tensor wmat = in_data[kWeight].get_with_shape(wmat_shape, s); - Tensor grad = out_grad[kOut].get(s); - Tensor gdata = in_grad[kData].get(s); - Tensor gwmat = in_grad[kWeight].get_with_shape(wmat_shape, s); + Tensor data = in_data[conv::kData].get(s); + Shape<3> wmat_shape = + Shape3(param_.num_group, + param_.num_filter / param_.num_group, + data.shape_[1] / param_.num_group * param_.kernel[0] * param_.kernel[1]); + Tensor wmat = in_data[conv::kWeight].get_with_shape(wmat_shape, s); + Tensor grad = out_grad[conv::kOut].get(s); + Tensor gdata = in_grad[conv::kData].get(s); + Tensor gwmat = in_grad[conv::kWeight].get_with_shape(wmat_shape, s); #if defined(__CUDACC__) CHECK_EQ(s->blas_handle_ownership_, Stream::OwnHandle) << "Must init CuBLAS handle in stream"; #endif const index_t nbatch = data.size(0); - Tensor workspace = ctx.requested[kTempSpace].get_space( + Tensor workspace = ctx.requested[conv::kTempSpace].get_space( Shape1(this->InitTemp(data.shape_, grad.shape_)), s); for (index_t i = 0; i < nbatch; i += nstep_) { const index_t step = std::min(nstep_, nbatch - i); @@ -176,29 +175,29 @@ class ConvolutionOp : public Operator { shape_dstunit_[2] * step), s); temp_dst = reshape(swapaxis<1, 0>(grad.Slice(i, i + step)), temp_dst.shape_); if (param_.pad[0] == 0 && param_.pad[1] == 0) { - // TODO(bing): dual stride temp_col = unpack_patch2col(data.Slice(i, i + step), param_.kernel[0], param_.kernel[1], - param_.stride[0]); + param_.stride[0], + param_.stride[1]); } else { - // TODO(bing): dual stride temp_col = unpack_patch2col(pad(data.Slice(i, i + step), param_.pad[0], param_.pad[1]), param_.kernel[0], param_.kernel[1], - param_.stride[0]); + param_.stride[0], + param_.stride[1]); } const index_t gstride = temp_col.size(0) / param_.num_group; for (uint32_t gid = 0; gid < param_.num_group; ++gid) { Tensor tmpc = temp_col.Slice(gstride * gid, gstride * (gid + 1)); if (i == 0) { Tensor tmp_gwmat = gwmat[gid]; - Assign(tmp_gwmat, req[kWeight], dot(temp_dst[gid], tmpc.T())); + Assign(tmp_gwmat, req[conv::kWeight], dot(temp_dst[gid], tmpc.T())); } else { gwmat[gid] += dot(temp_dst[gid], tmpc.T()); } } - if (req[kData] == kWriteTo || req[kData] == kWriteInplace) { + if (req[conv::kData] == kWriteTo || req[conv::kData] == kWriteInplace) { for (uint32_t gid = 0; gid < param_.num_group; ++gid) { Tensor tmpc = temp_col.Slice(gstride * gid, gstride * (gid + 1)); tmpc = dot(wmat[gid].T(), temp_dst[gid]); @@ -223,8 +222,8 @@ class ConvolutionOp : public Operator { } } if (!param_.no_bias) { - Tensor gbias = in_grad[kBias].get(s); - Assign(gbias, req[kBias], sumall_except_dim<1>(grad)); + Tensor gbias = in_grad[conv::kBias].get(s); + Assign(gbias, req[conv::kBias], sumall_except_dim<1>(grad)); } } @@ -291,24 +290,20 @@ class ConvolutionProp : public OperatorProperty { } else { CHECK_EQ(in_shape->size(), 2) << "Input:[data, weight]"; } - const TShape &dshape = (*in_shape)[kData]; + const TShape &dshape = (*in_shape)[conv::kData]; if (dshape.ndim() == 0) return false; CHECK_EQ(dshape.ndim(), 4) \ << "Input data should be 4D 
in batch-num_filter-y-x"; SHAPE_ASSIGN_CHECK(*in_shape, - kWeight, + conv::kWeight, Shape4(param_.num_filter, dshape[1], param_.kernel[0], param_.kernel[1])); if (!param_.no_bias) { - SHAPE_ASSIGN_CHECK(*in_shape, kBias, Shape1(param_.num_filter)); + SHAPE_ASSIGN_CHECK(*in_shape, conv::kBias, Shape1(param_.num_filter)); } out_shape->clear(); out_shape->push_back(dshape); const index_t ksize_y = static_cast(param_.kernel[0]); const index_t ksize_x = static_cast(param_.kernel[1]); - const index_t kstride = static_cast(param_.stride[0]); - // TODO(bing) : support dual stride - CHECK_EQ(param_.stride[0], param_.stride[1]) - << "Only support same stride now"; CHECK_EQ(dshape[1] % param_.num_group, 0) \ << "input num_filter must divide group size"; CHECK_EQ(param_.num_filter % param_.num_group, 0) \ @@ -319,9 +314,9 @@ class ConvolutionProp : public OperatorProperty { << "incorrect stride size: " << param_.stride; CHECK(ksize_x <= dshape[3] && ksize_y <= dshape[2]) << "kernel size exceed input"; - (*out_shape)[kOut][1] = param_.num_filter; - (*out_shape)[kOut][2] = (dshape[2] + 2 * param_.pad[0] - ksize_y) / kstride + 1; - (*out_shape)[kOut][3] = (dshape[3] + 2 * param_.pad[1] - ksize_x) / kstride + 1; + (*out_shape)[conv::kOut][1] = param_.num_filter; + (*out_shape)[conv::kOut][2] = (dshape[2] + 2 * param_.pad[0] - ksize_y) / param_.stride[0] + 1; + (*out_shape)[conv::kOut][3] = (dshape[3] + 2 * param_.pad[1] - ksize_x) / param_.stride[1] + 1; return true; } @@ -339,7 +334,7 @@ class ConvolutionProp : public OperatorProperty { const std::vector &out_grad, const std::vector &in_data, const std::vector &out_data) const override { - return {out_grad[kOut], in_data[kData], in_data[kWeight]}; + return {out_grad[conv::kOut], in_data[conv::kData], in_data[conv::kWeight]}; } std::vector ForwardResource( diff --git a/src/operator/cudnn_activation-inl.h b/src/operator/cudnn_activation-inl.h index 99bbfe93e871..7e6acea7c952 100644 --- a/src/operator/cudnn_activation-inl.h +++ b/src/operator/cudnn_activation-inl.h @@ -20,13 +20,13 @@ class CuDNNActivationOp : public Operator { init_cudnn_ = false; dtype_ = CUDNN_DATA_FLOAT; switch (param_.act_type) { - case kReLU: + case activation::kReLU: mode_ = CUDNN_ACTIVATION_RELU; break; - case kSigmoid: + case activation::kSigmoid: mode_ = CUDNN_ACTIVATION_SIGMOID; break; - case kTanh: + case activation::kTanh: mode_ = CUDNN_ACTIVATION_TANH; break; default: @@ -51,14 +51,14 @@ class CuDNNActivationOp : public Operator { Stream *s = ctx.get_stream(); Tensor data; Tensor out; - if (in_data[kData].ndim() == 2) { - uint32_t ds[] = {in_data[kData].shape_[0], in_data[kData].shape_[1], 1, 1}; - TShape dshape(ds, ds + 4); - data = in_data[kData].get_with_shape(dshape, s); - out = out_data[kOut].get_with_shape(dshape, s); + if (in_data[activation::kData].ndim() == 2) { + Shape<4> dshape = Shape4(in_data[activation::kData].shape_[0], + in_data[activation::kData].shape_[1], 1, 1); + data = in_data[activation::kData].get_with_shape(dshape, s); + out = out_data[activation::kOut].get_with_shape(dshape, s); } else { - data = in_data[kData].get(s); - out = out_data[kOut].get(s); + data = in_data[activation::kData].get(s); + out = out_data[activation::kOut].get(s); } float alpha = 1.0f; float beta = 0.0f; @@ -105,18 +105,18 @@ class CuDNNActivationOp : public Operator { Tensor data; Tensor output_data; Tensor input_grad; - if (in_data[kData].ndim() == 2) { - uint32_t ds[] = {in_data[kData].shape_[0], in_data[kData].shape_[1], 1, 1}; - TShape dshape(ds, ds + 4); - data = 
in_data[kData].get_with_shape(dshape, s); - grad = out_grad[kOut].get_with_shape(dshape, s); - output_data = out_data[kOut].get_with_shape(dshape, s); - input_grad = in_grad[kData].get_with_shape(dshape, s); + if (in_data[activation::kData].ndim() == 2) { + Shape<4> dshape = Shape4(in_data[activation::kData].shape_[0], + in_data[activation::kData].shape_[1], 1, 1); + data = in_data[activation::kData].get_with_shape(dshape, s); + grad = out_grad[activation::kOut].get_with_shape(dshape, s); + output_data = out_data[activation::kOut].get_with_shape(dshape, s); + input_grad = in_grad[activation::kData].get_with_shape(dshape, s); } else { - data = in_data[kData].get(s); - output_data = out_data[kOut].get(s); - grad = out_grad[kOut].get(s); - input_grad = in_grad[kData].get(s); + data = in_data[activation::kData].get(s); + output_data = out_data[activation::kOut].get(s); + grad = out_grad[activation::kOut].get(s); + input_grad = in_grad[activation::kData].get(s); } CHECK_EQ(s->dnn_handle_ownership_, mshadow::Stream::OwnHandle); CHECK_EQ(cudnnActivationBackward(s->dnn_handle_, diff --git a/src/operator/cudnn_convolution-inl.h b/src/operator/cudnn_convolution-inl.h index 2a89e7ee72bc..123187608237 100644 --- a/src/operator/cudnn_convolution-inl.h +++ b/src/operator/cudnn_convolution-inl.h @@ -47,16 +47,16 @@ class CuDNNConvolutionOp : public Operator { CHECK_EQ(in_data.size(), expected); CHECK_EQ(out_data.size(), 1); Stream *s = ctx.get_stream(); - Tensor data = in_data[kData].get(s); - Tensor wmat = in_data[kWeight].get(s); - Tensor out = out_data[kOut].get(s); + Tensor data = in_data[conv::kData].get(s); + Tensor wmat = in_data[conv::kWeight].get(s); + Tensor out = out_data[conv::kOut].get(s); CHECK_EQ(data.CheckContiguous(), true); CHECK_EQ(wmat.CheckContiguous(), true); CHECK_EQ(out.CheckContiguous(), true); if (!init_cudnn_) { Init(s, in_data, out_data); } - Tensor workspace = ctx.requested[kTempSpace].get_space( + Tensor workspace = ctx.requested[conv::kTempSpace].get_space( mshadow::Shape1(forward_workspace_), s); CHECK_EQ(cudnnConvolutionForward(s->dnn_handle_, &alpha, @@ -73,7 +73,7 @@ class CuDNNConvolutionOp : public Operator { out.dptr_), CUDNN_STATUS_SUCCESS); if (!param_.no_bias) { beta = 1.0f; - Tensor bias = in_data[kBias].get(s); + Tensor bias = in_data[conv::kBias].get(s); CHECK_EQ(cudnnAddTensor(s->dnn_handle_, CUDNN_ADD_SAME_C, &alpha, @@ -100,17 +100,17 @@ class CuDNNConvolutionOp : public Operator { CHECK_EQ(out_grad.size(), 1); CHECK(in_data.size() == expected && in_grad.size() == expected); // TODO(bing): think about how to support add to - CHECK_EQ(req[kWeight], kWriteTo); + CHECK_EQ(req[conv::kWeight], kWriteTo); Stream *s = ctx.get_stream(); - Tensor grad = out_grad[kOut].get(s); - Tensor wmat = in_data[kWeight].get(s); - Tensor gwmat = in_grad[kWeight].get(s); - Tensor data = in_data[kData].get(s); - Tensor gdata = in_grad[kData].get(s); - Tensor workspace = ctx.requested[kTempSpace].get_space( + Tensor grad = out_grad[conv::kOut].get(s); + Tensor wmat = in_data[conv::kWeight].get(s); + Tensor gwmat = in_grad[conv::kWeight].get(s); + Tensor data = in_data[conv::kData].get(s); + Tensor gdata = in_grad[conv::kData].get(s); + Tensor workspace = ctx.requested[conv::kTempSpace].get_space( mshadow::Shape1(backward_workspace_), s); if (!param_.no_bias) { - Tensor gbias = in_grad[kBias].get(s); + Tensor gbias = in_grad[conv::kBias].get(s); CHECK_EQ(cudnnConvolutionBackwardBias(s->dnn_handle_, &alpha, out_desc_, @@ -160,8 +160,8 @@ class CuDNNConvolutionOp : public Operator { 
size_t workspace_byte = static_cast(param_.workspace * sizeof(real_t)); size_t back_size = 0; size_t back_size_w = 0; - Tensor data = in_data[kData].get(s); - Tensor out = out_data[kOut].get(s); + Tensor data = in_data[conv::kData].get(s); + Tensor out = out_data[conv::kOut].get(s); CHECK_EQ(cudnnCreateTensorDescriptor(&in_desc_), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnCreateTensorDescriptor(&out_desc_), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnCreateTensorDescriptor(&bias_desc_), CUDNN_STATUS_SUCCESS); @@ -196,7 +196,7 @@ class CuDNNConvolutionOp : public Operator { out.shape_[2], out.shape_[3]), CUDNN_STATUS_SUCCESS); if (!param_.no_bias) { - Tensor bias = in_data[kBias].get(s); + Tensor bias = in_data[conv::kBias].get(s); CHECK_EQ(cudnnSetTensor4dDescriptor(bias_desc_, CUDNN_TENSOR_NCHW, dtype_, diff --git a/src/operator/cudnn_lrn-inl.h b/src/operator/cudnn_lrn-inl.h index eb520b2fbe68..e14c9f742eaa 100644 --- a/src/operator/cudnn_lrn-inl.h +++ b/src/operator/cudnn_lrn-inl.h @@ -38,8 +38,8 @@ class CuDNNLocalResponseNormOp : public Operator { float alpha = 1.0f; float beta = 0.0f; Stream *s = ctx.get_stream(); - Tensor data = in_data[kData].get(s); - Tensor out = out_data[kOut].get(s); + Tensor data = in_data[lrn_enum::kData].get(s); + Tensor out = out_data[lrn_enum::kOut].get(s); if (!init_cudnn_) { this->Init(s, in_data, out_data); } @@ -72,10 +72,10 @@ class CuDNNLocalResponseNormOp : public Operator { float alpha = 1.0f; float beta = 0.0f; Stream *s = ctx.get_stream(); - Tensor grad = out_grad[kOut].get(s); - Tensor data = in_data[kData].get(s); - Tensor output_data = out_data[kOut].get(s); - Tensor input_grad = in_grad[kData].get(s); + Tensor grad = out_grad[lrn_enum::kOut].get(s); + Tensor data = in_data[lrn_enum::kData].get(s); + Tensor output_data = out_data[lrn_enum::kOut].get(s); + Tensor input_grad = in_grad[lrn_enum::kData].get(s); CHECK_EQ(s->dnn_handle_ownership_, mshadow::Stream::OwnHandle); CHECK_EQ(cudnnLRNCrossChannelBackward(s->dnn_handle_, lrn_desc_, @@ -101,8 +101,8 @@ class CuDNNLocalResponseNormOp : public Operator { CHECK_EQ(out_data.size(), 1); if (!init_cudnn_) { init_cudnn_ = true; - Tensor data = in_data[kData].get(s); - Tensor out = out_data[kOut].get(s); + Tensor data = in_data[lrn_enum::kData].get(s); + Tensor out = out_data[lrn_enum::kOut].get(s); unsigned lrn_n = param_.nsize; double alpha = param_.alpha; double beta = param_.beta; diff --git a/src/operator/cudnn_pooling-inl.h b/src/operator/cudnn_pooling-inl.h index 67958ed46f26..3a56b6e545b8 100644 --- a/src/operator/cudnn_pooling-inl.h +++ b/src/operator/cudnn_pooling-inl.h @@ -22,10 +22,10 @@ class CuDNNPoolingOp : public Operator { // TODO(xxx): fp16 dtype_ = CUDNN_DATA_FLOAT; switch (param_.pool_type) { - case kMaxPooling: + case pool_enum::kMaxPooling: mode_ = CUDNN_POOLING_MAX; break; - case kAvgPooling: + case pool_enum::kAvgPooling: mode_ = CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING; break; default: @@ -49,8 +49,8 @@ class CuDNNPoolingOp : public Operator { CHECK_EQ(in_data.size(), 1); CHECK_EQ(out_data.size(), 1); Stream *s = ctx.get_stream(); - Tensor data = in_data[kData].get(s); - Tensor out = out_data[kOut].get(s); + Tensor data = in_data[pool_enum::kData].get(s); + Tensor out = out_data[pool_enum::kOut].get(s); CHECK_EQ(s->dnn_handle_ownership_, mshadow::Stream::OwnHandle); if (!init_cudnn_) { this->Init(s, in_data, out_data); @@ -85,10 +85,10 @@ class CuDNNPoolingOp : public Operator { CHECK_EQ(in_grad.size(), 1); Stream *s = ctx.get_stream(); - Tensor m_out_grad = out_grad[kOut].get(s); - 
Tensor m_in_data = in_data[kData].get(s); - Tensor m_out_data = out_data[kOut].get(s); - Tensor m_in_grad = in_grad[kData].get(s); + Tensor m_out_grad = out_grad[pool_enum::kOut].get(s); + Tensor m_in_data = in_data[pool_enum::kData].get(s); + Tensor m_out_data = out_data[pool_enum::kOut].get(s); + Tensor m_in_grad = in_grad[pool_enum::kData].get(s); CHECK_EQ(s->dnn_handle_ownership_, mshadow::Stream::OwnHandle); float alpha = 1.0f; float beta = 0.0f; @@ -115,8 +115,8 @@ class CuDNNPoolingOp : public Operator { CHECK_EQ(out_data.size(), 1); if (!init_cudnn_) { init_cudnn_ = true; - Tensor data = in_data[kData].get(s); - Tensor out = out_data[kOut].get(s); + Tensor data = in_data[pool_enum::kData].get(s); + Tensor out = out_data[pool_enum::kOut].get(s); CHECK_EQ(cudnnCreatePoolingDescriptor(&pooling_desc_), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnCreateTensorDescriptor(&in_desc_), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnCreateTensorDescriptor(&out_desc_), CUDNN_STATUS_SUCCESS); diff --git a/src/operator/dropout-inl.h b/src/operator/dropout-inl.h index 877eab61226b..fa76bd38ccf6 100644 --- a/src/operator/dropout-inl.h +++ b/src/operator/dropout-inl.h @@ -17,9 +17,11 @@ #include "./operator_common.h" #include "./mshadow_op.h" +namespace dropout { enum DropoutOpInputs {kData}; enum DropoutOpOutputs {kOut, kMask}; enum DropoutOpForwardResource {kRandom}; +} // namespace dropout namespace mxnet { namespace op { @@ -52,15 +54,15 @@ class DropoutOp : public Operator { CHECK_EQ(out_data.size(), 2); } Stream *s = ctx.get_stream(); - Tensor data = in_data[kData].FlatTo2D(s); - Tensor out = out_data[kOut].FlatTo2D(s); + Tensor data = in_data[dropout::kData].FlatTo2D(s); + Tensor out = out_data[dropout::kOut].FlatTo2D(s); if (ctx.is_train) { - Tensor mask = out_data[kMask].FlatTo2D(s); - Random *prnd = ctx.requested[kRandom].get_random(s); + Tensor mask = out_data[dropout::kMask].FlatTo2D(s); + Random *prnd = ctx.requested[dropout::kRandom].get_random(s); mask = F(prnd->uniform(mask.shape_), pkeep_) * (1.0f / pkeep_); - Assign(out, req[kOut], data * mask); + Assign(out, req[dropout::kOut], data * mask); } else { - Assign(out, req[kOut], F(data)); + Assign(out, req[dropout::kOut], F(data)); } } @@ -76,10 +78,10 @@ class DropoutOp : public Operator { CHECK_EQ(out_grad.size(), 1); CHECK_EQ(in_grad.size(), 1); Stream *s = ctx.get_stream(); - Tensor grad = out_grad[kOut].FlatTo2D(s); - Tensor mask = out_data[kMask].FlatTo2D(s); - Tensor gdata = in_grad[kData].FlatTo2D(s); - Assign(gdata, req[kData], grad * mask); + Tensor grad = out_grad[dropout::kOut].FlatTo2D(s); + Tensor mask = out_data[dropout::kMask].FlatTo2D(s); + Tensor gdata = in_grad[dropout::kData].FlatTo2D(s); + Assign(gdata, req[dropout::kData], grad * mask); } private: @@ -128,7 +130,7 @@ class DropoutProp : public OperatorProperty { const std::vector &out_grad, const std::vector &in_data, const std::vector &out_data) const override { - return {out_grad[kOut], out_data[kMask]}; + return {out_grad[dropout::kOut], out_data[dropout::kMask]}; } std::vector > BackwardInplaceOption( @@ -136,13 +138,13 @@ class DropoutProp : public OperatorProperty { const std::vector &in_data, const std::vector &out_data, const std::vector &in_grad) const override { - return {{out_grad[kOut], in_grad[kData]}}; + return {{out_grad[dropout::kOut], in_grad[dropout::kData]}}; } std::vector > ForwardInplaceOption( const std::vector &in_data, const std::vector &out_data) const override { - return {{in_data[kData], out_data[kOut]}}; + return {{in_data[dropout::kData], 
out_data[dropout::kOut]}}; } std::vector ForwardResource( diff --git a/src/operator/elementwise_binary_op-inl.h b/src/operator/elementwise_binary_op-inl.h index 6b1cf62242bd..4a751146b769 100644 --- a/src/operator/elementwise_binary_op-inl.h +++ b/src/operator/elementwise_binary_op-inl.h @@ -19,31 +19,33 @@ namespace mxnet { namespace op { +namespace elembinary { enum ElementWiseBinaryOpInputs {kLhs, kRhs}; enum ElementWiseBinaryOpOutputs {kOut}; enum ElementWiseBinaryOpType {kPlus, kMinus, kMul, kDiv}; +} // elembinary template -inline ElementWiseBinaryOpType GetOpType(); +inline elembinary::ElementWiseBinaryOpType GetOpType(); template inline const char* GetOpTypeString(); template<> -inline ElementWiseBinaryOpType GetOpType() { - return kPlus; +inline elembinary::ElementWiseBinaryOpType GetOpType() { + return elembinary::kPlus; } template<> -inline ElementWiseBinaryOpType GetOpType() { - return kMinus; +inline elembinary::ElementWiseBinaryOpType GetOpType() { + return elembinary::kMinus; } template<> -inline ElementWiseBinaryOpType GetOpType() { - return kMul; +inline elembinary::ElementWiseBinaryOpType GetOpType() { + return elembinary::kMul; } template<> -inline ElementWiseBinaryOpType GetOpType() { - return kDiv; +inline elembinary::ElementWiseBinaryOpType GetOpType() { + return elembinary::kDiv; } template<> @@ -78,10 +80,10 @@ class ElementWiseBinaryOp : public Operator { CHECK_EQ(in_data.size(), 2); CHECK_EQ(out_data.size(), 1); Stream *s = ctx.get_stream(); - Tensor lhs = in_data[kLhs].FlatTo2D(s); - Tensor rhs = in_data[kRhs].FlatTo2D(s); - Tensor out = out_data[kOut].FlatTo2D(s); - Assign(out, req[kOut], F(lhs, rhs)); + Tensor lhs = in_data[elembinary::kLhs].FlatTo2D(s); + Tensor rhs = in_data[elembinary::kRhs].FlatTo2D(s); + Tensor out = out_data[elembinary::kOut].FlatTo2D(s); + Assign(out, req[elembinary::kOut], F(lhs, rhs)); } virtual void Backward(const OpContext &ctx, @@ -98,37 +100,37 @@ class ElementWiseBinaryOp : public Operator { CHECK_EQ(req.size(), 2); Stream *s = ctx.get_stream(); - Tensor m_out_grad = out_grad[kOut].FlatTo2D(s); - Tensor lhs_grad = in_grad[kLhs].FlatTo2D(s); - Tensor rhs_grad = in_grad[kRhs].FlatTo2D(s); + Tensor m_out_grad = out_grad[elembinary::kOut].FlatTo2D(s); + Tensor lhs_grad = in_grad[elembinary::kLhs].FlatTo2D(s); + Tensor rhs_grad = in_grad[elembinary::kRhs].FlatTo2D(s); switch (GetOpType()) { - case kPlus: { - Assign(lhs_grad, req[kLhs], F(m_out_grad)); - Assign(rhs_grad, req[kRhs], F(m_out_grad)); + case elembinary::kPlus: { + Assign(lhs_grad, req[elembinary::kLhs], F(m_out_grad)); + Assign(rhs_grad, req[elembinary::kRhs], F(m_out_grad)); break; } - case kMinus: { - Assign(lhs_grad, req[kLhs], F(m_out_grad)); - Assign(rhs_grad, req[kRhs], F(m_out_grad)); + case elembinary::kMinus: { + Assign(lhs_grad, req[elembinary::kLhs], F(m_out_grad)); + Assign(rhs_grad, req[elembinary::kRhs], F(m_out_grad)); break; } - case kMul: { - Tensor lhs_data = in_data[kLhs].FlatTo2D(s); - Tensor rhs_data = in_data[kRhs].FlatTo2D(s); + case elembinary::kMul: { + Tensor lhs_data = in_data[elembinary::kLhs].FlatTo2D(s); + Tensor rhs_data = in_data[elembinary::kRhs].FlatTo2D(s); // rhs cannot do inplace - CHECK_NE(req[kRhs], kWriteInplace); - Assign(rhs_grad, req[kRhs], lhs_data * m_out_grad); - Assign(lhs_grad, req[kLhs], rhs_data * m_out_grad); + CHECK_NE(req[elembinary::kRhs], kWriteInplace); + Assign(rhs_grad, req[elembinary::kRhs], lhs_data * m_out_grad); + Assign(lhs_grad, req[elembinary::kLhs], rhs_data * m_out_grad); break; } - case kDiv: { - Tensor 
lhs_data = in_data[kLhs].FlatTo2D(s); - Tensor rhs_data = in_data[kRhs].FlatTo2D(s); + case elembinary::kDiv: { + Tensor lhs_data = in_data[elembinary::kLhs].FlatTo2D(s); + Tensor rhs_data = in_data[elembinary::kRhs].FlatTo2D(s); // rhs cannot do inplace - CHECK_NE(req[kRhs], kWriteInplace); - Assign(rhs_grad, req[kRhs], + CHECK_NE(req[elembinary::kRhs], kWriteInplace); + Assign(rhs_grad, req[elembinary::kRhs], F(m_out_grad * lhs_data) / F(rhs_data)); - Assign(lhs_grad, req[kLhs], m_out_grad / rhs_data); + Assign(lhs_grad, req[elembinary::kLhs], m_out_grad / rhs_data); break; } } @@ -137,15 +139,15 @@ class ElementWiseBinaryOp : public Operator { template -inline Operator* CreateElementWiseBinaryOp_(ElementWiseBinaryOpType type) { +inline Operator* CreateElementWiseBinaryOp_(elembinary::ElementWiseBinaryOpType type) { switch (type) { - case kPlus: + case elembinary::kPlus: return new ElementWiseBinaryOp(); - case kMinus: + case elembinary::kMinus: return new ElementWiseBinaryOp(); - case kMul: + case elembinary::kMul: return new ElementWiseBinaryOp(); - case kDiv: + case elembinary::kDiv: return new ElementWiseBinaryOp(); } LOG(FATAL) << "uknown op type"; @@ -154,7 +156,7 @@ inline Operator* CreateElementWiseBinaryOp_(ElementWiseBinaryOpType type) { // Decalre Factory function, used for dispatch specialization template -Operator* CreateElementWiseBinaryOp(ElementWiseBinaryOpType type); +Operator* CreateElementWiseBinaryOp(elembinary::ElementWiseBinaryOpType type); #if DMLC_USE_CXX11 template @@ -173,14 +175,14 @@ class ElementWiseBinaryOpProp : public OperatorProperty { std::vector *aux_shape) const override { using namespace mshadow; CHECK_EQ(in_shape->size(), 2) << "Input:[lhs, rhs]"; - if (in_shape->at(kLhs).ndim() != 0) { - SHAPE_ASSIGN_CHECK(*in_shape, kRhs, in_shape->at(kLhs)); - } else if (in_shape->at(kRhs).ndim() != 0) { - in_shape->at(kLhs) = in_shape->at(kRhs); + if (in_shape->at(elembinary::kLhs).ndim() != 0) { + SHAPE_ASSIGN_CHECK(*in_shape, elembinary::kRhs, in_shape->at(elembinary::kLhs)); + } else if (in_shape->at(elembinary::kRhs).ndim() != 0) { + in_shape->at(elembinary::kLhs) = in_shape->at(elembinary::kRhs); } else { return false; } - const TShape &dshape = in_shape->at(kLhs); + const TShape &dshape = in_shape->at(elembinary::kLhs); out_shape->clear(); out_shape->push_back(dshape); return true; @@ -204,12 +206,12 @@ class ElementWiseBinaryOpProp : public OperatorProperty { const std::vector &in_data, const std::vector &out_data) const override { switch (GetOpType()) { - case kPlus: - case kMinus: - return {out_grad[kOut]}; - case kMul: - case kDiv: - return {out_grad[kOut], in_data[kLhs], in_data[kRhs]}; + case elembinary::kPlus: + case elembinary::kMinus: + return {out_grad[elembinary::kOut]}; + case elembinary::kMul: + case elembinary::kDiv: + return {out_grad[elembinary::kOut], in_data[elembinary::kLhs], in_data[elembinary::kRhs]}; } LOG(FATAL) << "not reached"; return {}; @@ -221,12 +223,12 @@ class ElementWiseBinaryOpProp : public OperatorProperty { const std::vector &out_data, const std::vector &in_grad) const override { switch (GetOpType()) { - case kPlus: - case kMinus: + case elembinary::kPlus: + case elembinary::kMinus: return {}; - case kMul: - case kDiv: - return {{out_grad[kOut], in_grad[kLhs]}}; + case elembinary::kMul: + case elembinary::kDiv: + return {{out_grad[elembinary::kOut], in_grad[elembinary::kLhs]}}; } LOG(FATAL) << "not reached"; return {}; @@ -235,7 +237,7 @@ class ElementWiseBinaryOpProp : public OperatorProperty { std::vector > 
ForwardInplaceOption( const std::vector &in_data, const std::vector &out_data) const override { - return {{in_data[kLhs], out_data[kOut]}}; + return {{in_data[elembinary::kLhs], out_data[elembinary::kOut]}}; } Operator* CreateOperator(Context ctx) const override; diff --git a/src/operator/elementwise_binary_op.cc b/src/operator/elementwise_binary_op.cc index 0485707ffc18..940dce2beec6 100644 --- a/src/operator/elementwise_binary_op.cc +++ b/src/operator/elementwise_binary_op.cc @@ -8,7 +8,7 @@ namespace mxnet { namespace op { template<> -Operator* CreateElementWiseBinaryOp(ElementWiseBinaryOpType type) { +Operator* CreateElementWiseBinaryOp(elembinary::ElementWiseBinaryOpType type) { return CreateElementWiseBinaryOp_(type); } diff --git a/src/operator/elementwise_binary_op.cu b/src/operator/elementwise_binary_op.cu index ba8991707f12..90d85ae20a18 100644 --- a/src/operator/elementwise_binary_op.cu +++ b/src/operator/elementwise_binary_op.cu @@ -8,7 +8,7 @@ namespace mxnet { namespace op { template<> -Operator* CreateElementWiseBinaryOp(ElementWiseBinaryOpType type) { +Operator* CreateElementWiseBinaryOp(elembinary::ElementWiseBinaryOpType type) { return CreateElementWiseBinaryOp_(type); } } // namespace op diff --git a/src/operator/elementwise_sum-inl.h b/src/operator/elementwise_sum-inl.h index 213add51357a..d9c4c0e36206 100644 --- a/src/operator/elementwise_sum-inl.h +++ b/src/operator/elementwise_sum-inl.h @@ -21,8 +21,10 @@ namespace mxnet { namespace op { +namespace elemsum { enum ElementWiseSumOpInputs {kData0, kData1, kData2, kData3}; enum ElementWiseSumOpOutputs {kOut}; +} // namespace elemsum struct ElementWiseSumParam : public dmlc::Parameter { int num_args; @@ -47,35 +49,35 @@ class ElementWiseSumOp : public Operator { using namespace mshadow::expr; CHECK_EQ(static_cast(in_data.size()), size_); CHECK_EQ(out_data.size(), 1); - if (req[kOut] == kNullOp) return; + if (req[elemsum::kOut] == kNullOp) return; Stream *s = ctx.get_stream(); - Tensor out = out_data[kOut].FlatTo2D(s); + Tensor out = out_data[elemsum::kOut].FlatTo2D(s); switch (size_) { case 2: { - Tensor in_0 = in_data[kData0].FlatTo2D(s); - Tensor in_1 = in_data[kData1].FlatTo2D(s); - Assign(out, req[kOut], in_0 + in_1); + Tensor in_0 = in_data[elemsum::kData0].FlatTo2D(s); + Tensor in_1 = in_data[elemsum::kData1].FlatTo2D(s); + Assign(out, req[elemsum::kOut], in_0 + in_1); break; } case 3: { - Tensor in_0 = in_data[kData0].FlatTo2D(s); - Tensor in_1 = in_data[kData1].FlatTo2D(s); - Tensor in_2 = in_data[kData2].FlatTo2D(s); - Assign(out, req[kOut], in_0 + in_1 + in_2); + Tensor in_0 = in_data[elemsum::kData0].FlatTo2D(s); + Tensor in_1 = in_data[elemsum::kData1].FlatTo2D(s); + Tensor in_2 = in_data[elemsum::kData2].FlatTo2D(s); + Assign(out, req[elemsum::kOut], in_0 + in_1 + in_2); break; } case 4: { - Tensor in_0 = in_data[kData0].FlatTo2D(s); - Tensor in_1 = in_data[kData1].FlatTo2D(s); - Tensor in_2 = in_data[kData2].FlatTo2D(s); - Tensor in_3 = in_data[kData3].FlatTo2D(s); - Assign(out, req[kOut], in_0 + in_1 + in_2 + in_3); + Tensor in_0 = in_data[elemsum::kData0].FlatTo2D(s); + Tensor in_1 = in_data[elemsum::kData1].FlatTo2D(s); + Tensor in_2 = in_data[elemsum::kData2].FlatTo2D(s); + Tensor in_3 = in_data[elemsum::kData3].FlatTo2D(s); + Assign(out, req[elemsum::kOut], in_0 + in_1 + in_2 + in_3); break; } default: { - Tensor in_0 = in_data[kData0].FlatTo2D(s); - Assign(out, req[kOut], F(in_0)); + Tensor in_0 = in_data[elemsum::kData0].FlatTo2D(s); + Assign(out, req[elemsum::kOut], F(in_0)); for (int i = 1; i < 
size_; ++i) { out += in_data[i].FlatTo2D(s); } @@ -95,7 +97,7 @@ class ElementWiseSumOp : public Operator { using namespace mshadow::expr; CHECK_EQ(in_grad.size(), static_cast(size_)); Stream *s = ctx.get_stream(); - Tensor ograd = out_grad[kOut].FlatTo2D(s); + Tensor ograd = out_grad[elemsum::kOut].FlatTo2D(s); for (int i = 0; i < size_; ++i) { if (req[i] == kNullOp || req[i] == kWriteInplace) continue; Tensor igrad = in_grad[i].FlatTo2D(s); diff --git a/src/operator/fully_connected-inl.h b/src/operator/fully_connected-inl.h index 6fec9f5d13a5..262aba95d0fb 100644 --- a/src/operator/fully_connected-inl.h +++ b/src/operator/fully_connected-inl.h @@ -21,8 +21,10 @@ namespace op { // Declare enumeration of input order to make code more intuitive. // These enums are only visible within this header +namespace fullc { enum FullyConnectedOpInputs {kData, kWeight, kBias}; enum FullyConnectedOpOutputs {kOut}; +} // fullc struct FullyConnectedParam : public dmlc::Parameter { int num_hidden; @@ -55,7 +57,7 @@ class FullyConnectedOp : public Operator { const std::vector &aux_args) { using namespace mshadow; using namespace mshadow::expr; - CHECK_EQ(req[kOut], kWriteTo); + CHECK_EQ(req[fullc::kOut], kWriteTo); size_t expected = param_.no_bias ? 2 : 3; CHECK_EQ(in_data.size(), expected); CHECK_EQ(out_data.size(), 1); @@ -67,12 +69,12 @@ class FullyConnectedOp : public Operator { CHECK_EQ(s->blas_handle_ownership_, Stream::OwnHandle) << "Must init CuBLAS handle in stream"; #endif // __CUDACC__ - Tensor data = in_data[kData].FlatTo2D(s); - Tensor wmat = in_data[kWeight].get(s); - Tensor out = out_data[kOut].FlatTo2D(s); + Tensor data = in_data[fullc::kData].FlatTo2D(s); + Tensor wmat = in_data[fullc::kWeight].get(s); + Tensor out = out_data[fullc::kOut].FlatTo2D(s); out = dot(data, wmat.T()); if (!param_.no_bias) { - Tensor bias = in_data[kBias].get(s); + Tensor bias = in_data[fullc::kBias].get(s); out += repmat(bias, data.size(0)); } } @@ -93,26 +95,26 @@ class FullyConnectedOp : public Operator { // TODO(bing): check the BLAS Handle, be careful // maybe need blas handle from context Stream *s = ctx.get_stream(); - Tensor data = in_data[kData].FlatTo2D(s); - Tensor wmat = in_data[kWeight].get(s); - Tensor grad = out_grad[kOut].FlatTo2D(s); + Tensor data = in_data[fullc::kData].FlatTo2D(s); + Tensor wmat = in_data[fullc::kWeight].get(s); + Tensor grad = out_grad[fullc::kOut].FlatTo2D(s); #if defined(__CUDACC__) CHECK_EQ(s->blas_handle_ownership_, Stream::OwnHandle) << "Must init CuBLAS handle in stream"; #endif // backprop - CHECK_NE(req[kWeight], kWriteInplace) << "cannot write weight inplace"; + CHECK_NE(req[fullc::kWeight], kWriteInplace) << "cannot write weight inplace"; // gradient of weight - Tensor gwmat = in_grad[kWeight].get(s); - Assign(gwmat, req[kWeight], dot(grad.T(), data)); + Tensor gwmat = in_grad[fullc::kWeight].get(s); + Assign(gwmat, req[fullc::kWeight], dot(grad.T(), data)); // gradient of bias if (!param_.no_bias) { - Tensor gbias = in_grad[kBias].get(s); - Assign(gbias, req[kBias], sum_rows(grad)); + Tensor gbias = in_grad[fullc::kBias].get(s); + Assign(gbias, req[fullc::kBias], sum_rows(grad)); } // gradient of data - Tensor gdata = in_grad[kData].FlatTo2D(s); - Assign(gdata, req[kData], dot(grad, wmat)); + Tensor gdata = in_grad[fullc::kData].FlatTo2D(s); + Assign(gdata, req[fullc::kData], dot(grad, wmat)); } private: @@ -151,16 +153,16 @@ class FullyConnectedProp : public OperatorProperty { } else { CHECK_EQ(in_shape->size(), 2) << "Input:[data, weight]"; } - const TShape &dshape 
= (*in_shape)[kData]; + const TShape &dshape = (*in_shape)[fullc::kData]; // require data to be known if (dshape.ndim() == 0) return false; index_t num_input = 0; mshadow::Shape<2> ishape = dshape.FlatTo2D(); num_input = ishape[1]; - SHAPE_ASSIGN_CHECK(*in_shape, kWeight, Shape2(param_.num_hidden, num_input)); + SHAPE_ASSIGN_CHECK(*in_shape, fullc::kWeight, Shape2(param_.num_hidden, num_input)); if (!param_.no_bias) { - SHAPE_ASSIGN_CHECK(*in_shape, kBias, Shape1(param_.num_hidden)); + SHAPE_ASSIGN_CHECK(*in_shape, fullc::kBias, Shape1(param_.num_hidden)); } out_shape->clear(); out_shape->push_back(Shape2(dshape[0], param_.num_hidden)); @@ -182,7 +184,7 @@ class FullyConnectedProp : public OperatorProperty { const std::vector &out_grad, const std::vector &in_data, const std::vector &out_data) const override { - return {out_grad[kOut], in_data[kData], in_data[kWeight]}; + return {out_grad[fullc::kOut], in_data[fullc::kData], in_data[fullc::kWeight]}; } std::vector > BackwardInplaceOption( @@ -190,7 +192,7 @@ class FullyConnectedProp : public OperatorProperty { const std::vector &in_data, const std::vector &out_data, const std::vector &in_grad) const override { - return {{in_data[kData], in_grad[kData]}}; + return {{in_data[fullc::kData], in_grad[fullc::kData]}}; } Operator* CreateOperator(Context ctx) const; diff --git a/src/operator/leaky_relu-inl.h b/src/operator/leaky_relu-inl.h index 68cb52eea25f..4bdb65ef415a 100644 --- a/src/operator/leaky_relu-inl.h +++ b/src/operator/leaky_relu-inl.h @@ -21,10 +21,12 @@ namespace mxnet { namespace op { +namespace leakyrelu { enum LeakyReLUOpInputs {kData, kGamma}; enum LeakyReLUOpOutputs {kOut, kMask}; enum LeakyReLUOpType {kLeakyReLU, kPReLU, kRReLU}; enum LeakyReLUOpResource {kRandom}; +} // namespace leakyrelu struct LeakyReLUParam : public dmlc::Parameter { // use int for enumeration @@ -33,10 +35,10 @@ struct LeakyReLUParam : public dmlc::Parameter { float lower_bound; float upper_bound; DMLC_DECLARE_PARAMETER(LeakyReLUParam) { - DMLC_DECLARE_FIELD(act_type).set_default(kLeakyReLU) - .add_enum("rrelu", kRReLU) - .add_enum("leaky", kLeakyReLU) - .add_enum("prelu", kPReLU) + DMLC_DECLARE_FIELD(act_type).set_default(leakyrelu::kLeakyReLU) + .add_enum("rrelu", leakyrelu::kRReLU) + .add_enum("leaky", leakyrelu::kLeakyReLU) + .add_enum("prelu", leakyrelu::kPReLU) .describe("Activation function to be applied."); DMLC_DECLARE_FIELD(slope).set_default(0.25f) .describe("Init slope for the activation. (For leaky only)"); @@ -67,47 +69,48 @@ class LeakyReLUOp : public Operator { const std::vector &aux_args) { using namespace mshadow; using namespace mshadow::expr; - size_t expected = param_.act_type == kPReLU ? 2 : 1; + size_t expected = param_.act_type == leakyrelu::kPReLU ? 
2 : 1; CHECK_EQ(in_data.size(), expected); Stream *s = ctx.get_stream(); Tensor data; Tensor out; Tensor mask; Tensor weight; - if (in_data[kData].ndim() == 2) { - uint32_t ds[] = {in_data[kData].shape_[0], in_data[kData].shape_[1], 1, 1}; - TShape dshape(ds, ds + 4); - data = in_data[kData].get_with_shape(dshape, s); - out = out_data[kOut].get_with_shape(dshape, s); - if (param_.act_type == kRReLU) { - mask = out_data[kMask].get_with_shape(dshape, s); + if (in_data[leakyrelu::kData].ndim() == 2) { + Shape<4> dshape = Shape4(in_data[leakyrelu::kData].shape_[0], + in_data[leakyrelu::kData].shape_[1], 1, 1); + data = in_data[leakyrelu::kData].get_with_shape(dshape, s); + out = out_data[leakyrelu::kOut].get_with_shape(dshape, s); + if (param_.act_type == leakyrelu::kRReLU) { + mask = out_data[leakyrelu::kMask].get_with_shape(dshape, s); } } else { - data = in_data[kData].get(s); - out = out_data[kOut].get(s); - if (param_.act_type == kRReLU) { - mask = out_data[kMask].get(s); + data = in_data[leakyrelu::kData].get(s); + out = out_data[leakyrelu::kOut].get(s); + if (param_.act_type == leakyrelu::kRReLU) { + mask = out_data[leakyrelu::kMask].get(s); } } switch (param_.act_type) { - case kLeakyReLU: { - Assign(out, req[kOut], F(data, param_.slope)); + case leakyrelu::kLeakyReLU: { + Assign(out, req[leakyrelu::kOut], F(data, param_.slope)); break; } - case kPReLU: { - weight = in_data[kGamma].get(s); - Assign(out, req[kOut], F(data, broadcast<1>(weight, out.shape_))); + case leakyrelu::kPReLU: { + weight = in_data[leakyrelu::kGamma].get(s); + Assign(out, req[leakyrelu::kOut], + F(data, broadcast<1>(weight, out.shape_))); break; } - case kRReLU: { + case leakyrelu::kRReLU: { if (ctx.is_train) { - Random* prnd = ctx.requested[kRandom].get_random(s); + Random* prnd = ctx.requested[leakyrelu::kRandom].get_random(s); mask = prnd->uniform(mask.shape_); mask = mask * (param_.upper_bound - param_.lower_bound) + param_.lower_bound; - Assign(out, req[kOut], F(data, mask)); + Assign(out, req[leakyrelu::kOut], F(data, mask)); } else { const float slope = (param_.lower_bound + param_.upper_bound) / 2.0f; - Assign(out, req[kOut], F(data, slope)); + Assign(out, req[leakyrelu::kOut], F(data, slope)); } break; } @@ -126,7 +129,7 @@ class LeakyReLUOp : public Operator { using namespace mshadow; using namespace mshadow::expr; // TODO(bing): double check - size_t expected = param_.act_type == kPReLU ? 2 : 1; + size_t expected = param_.act_type == leakyrelu::kPReLU ? 
2 : 1; CHECK_EQ(out_grad.size(), 1); CHECK_EQ(req.size(), expected); CHECK_EQ(in_data.size(), expected); @@ -138,43 +141,43 @@ class LeakyReLUOp : public Operator { Tensor mask; Tensor weight; Tensor grad_weight; - if (in_data[kData].ndim() == 2) { - uint32_t ds[] = {in_data[kData].shape_[0], in_data[kData].shape_[1], 1, 1}; - TShape dshape(ds, ds + 4); - grad = out_grad[kOut].get_with_shape(dshape, s); - gdata = in_grad[kData].get_with_shape(dshape, s); - output = out_data[kOut].get_with_shape(dshape, s); - if (param_.act_type == kRReLU) { - mask = out_data[kMask].get_with_shape(dshape, s); + if (in_data[leakyrelu::kData].ndim() == 2) { + Shape<4> dshape = Shape4(in_data[leakyrelu::kData].shape_[0], + in_data[leakyrelu::kData].shape_[1], 1, 1); + grad = out_grad[leakyrelu::kOut].get_with_shape(dshape, s); + gdata = in_grad[leakyrelu::kData].get_with_shape(dshape, s); + output = out_data[leakyrelu::kOut].get_with_shape(dshape, s); + if (param_.act_type == leakyrelu::kRReLU) { + mask = out_data[leakyrelu::kMask].get_with_shape(dshape, s); } - if (param_.act_type == kPReLU) { - data = in_data[kData].get_with_shape(dshape, s); + if (param_.act_type == leakyrelu::kPReLU) { + data = in_data[leakyrelu::kData].get_with_shape(dshape, s); } } else { - grad = out_grad[kOut].get(s); - gdata = in_grad[kData].get(s); - output = out_data[kOut].get(s); - if (param_.act_type == kRReLU) { - mask = out_data[kMask].get(s); + grad = out_grad[leakyrelu::kOut].get(s); + gdata = in_grad[leakyrelu::kData].get(s); + output = out_data[leakyrelu::kOut].get(s); + if (param_.act_type == leakyrelu::kRReLU) { + mask = out_data[leakyrelu::kMask].get(s); } - if (param_.act_type == kPReLU) { - data = in_data[kData].get(s); + if (param_.act_type == leakyrelu::kPReLU) { + data = in_data[leakyrelu::kData].get(s); } } switch (param_.act_type) { - case kLeakyReLU: { - Assign(gdata, req[kData], F(output, param_.slope) * grad); + case leakyrelu::kLeakyReLU: { + Assign(gdata, req[leakyrelu::kData], F(output, param_.slope) * grad); break; } - case kPReLU: { - weight = in_data[kGamma].get(s); - grad_weight = in_grad[kGamma].get(s); + case leakyrelu::kPReLU: { + weight = in_data[leakyrelu::kGamma].get(s); + grad_weight = in_grad[leakyrelu::kGamma].get(s); grad_weight = sumall_except_dim<1>(F(data) * grad); gdata = F(output, broadcast<1>(weight, data.shape_)) * grad; break; } - case kRReLU: { - Assign(gdata, req[kData], F(output, mask) * grad); + case leakyrelu::kRReLU: { + Assign(gdata, req[leakyrelu::kData], F(output, mask) * grad); break; } default: @@ -204,19 +207,19 @@ class LeakyReLUProp : public OperatorProperty { std::vector *out_shape, std::vector *aux_shape) const override { using namespace mshadow; - if (param_.act_type == kPReLU) { + if (param_.act_type == leakyrelu::kPReLU) { CHECK_EQ(in_shape->size(), 2) << "Input:[data, gamma]"; } else { CHECK_EQ(in_shape->size(), 1) << "Input:[data]"; } - const TShape &dshape = in_shape->at(kData); + const TShape &dshape = in_shape->at(leakyrelu::kData); if (dshape.ndim() == 0) return false; - if (param_.act_type == kPReLU) { - in_shape->at(kGamma) = TShape(Shape1(dshape[1])); + if (param_.act_type == leakyrelu::kPReLU) { + in_shape->at(leakyrelu::kGamma) = TShape(Shape1(dshape[1])); } out_shape->clear(); out_shape->push_back(dshape); - if (param_.act_type == kRReLU) { + if (param_.act_type == leakyrelu::kRReLU) { out_shape->push_back(dshape); } return true; @@ -237,12 +240,15 @@ class LeakyReLUProp : public OperatorProperty { const std::vector &out_grad, const std::vector &in_data, 
const std::vector &out_data) const override { - if (param_.act_type == kPReLU) { - return {out_grad[kOut], out_data[kOut], in_data[kData], in_data[kGamma]}; - } else if (param_.act_type == kRReLU) { - return {out_grad[kOut], out_data[kMask], out_data[kOut]}; + if (param_.act_type == leakyrelu::kPReLU) { + return {out_grad[leakyrelu::kOut], + out_data[leakyrelu::kOut], + in_data[leakyrelu::kData], + in_data[leakyrelu::kGamma]}; + } else if (param_.act_type == leakyrelu::kRReLU) { + return {out_grad[leakyrelu::kOut], out_data[leakyrelu::kMask], out_data[leakyrelu::kOut]}; } else { - return {out_grad[kOut], out_data[kData]}; + return {out_grad[leakyrelu::kOut], out_data[leakyrelu::kData]}; } } @@ -251,21 +257,21 @@ class LeakyReLUProp : public OperatorProperty { const std::vector &in_data, const std::vector &out_data, const std::vector &in_grad) const override { - return {{out_grad[kOut], in_grad[kData]}}; + return {{out_grad[leakyrelu::kOut], in_grad[leakyrelu::kData]}}; } std::vector > ForwardInplaceOption( const std::vector &in_data, const std::vector &out_data) const override { - if (param_.act_type == kPReLU) { + if (param_.act_type == leakyrelu::kPReLU) { return {}; } else { - return {{in_data[kData], out_data[kOut]}}; + return {{in_data[leakyrelu::kData], out_data[leakyrelu::kOut]}}; } } std::vector ListArguments() const override { - if (param_.act_type == kPReLU) { + if (param_.act_type == leakyrelu::kPReLU) { return {"data", "gamma"}; } else { return {"data"}; @@ -273,7 +279,7 @@ class LeakyReLUProp : public OperatorProperty { } std::vector ListOutputs() const override { - if (param_.act_type == kRReLU) { + if (param_.act_type == leakyrelu::kRReLU) { return {"output", "mask"}; } else { return {"output"}; @@ -281,7 +287,7 @@ class LeakyReLUProp : public OperatorProperty { } int NumOutputs() const override { - if (param_.act_type == kRReLU) { + if (param_.act_type == leakyrelu::kRReLU) { return 2; } else { return 1; @@ -294,7 +300,7 @@ class LeakyReLUProp : public OperatorProperty { virtual std::vector ForwardResource( const std::vector &in_shape) const { - if (param_.act_type == kRReLU) { + if (param_.act_type == leakyrelu::kRReLU) { return {ResourceRequest::kRandom}; } else { return std::vector(); diff --git a/src/operator/lrn-inl.h b/src/operator/lrn-inl.h index 93c0e346de42..35aac8fe73ae 100644 --- a/src/operator/lrn-inl.h +++ b/src/operator/lrn-inl.h @@ -18,8 +18,11 @@ namespace mxnet { namespace op { + +namespace lrn_enum { enum LRNInputs {kData}; enum LRNOutputs {kOut, kTmpNorm}; +} // namespace lrn_enum struct LRNParam : public dmlc::Parameter { float alpha; @@ -58,11 +61,11 @@ class LocalResponseNormOp : public Operator { CHECK_EQ(param_.nsize % 2, 1) << "LRN only supports odd values for local_size"; const real_t salpha = param_.alpha / param_.nsize; Stream *s = ctx.get_stream(); - Tensor data = in_data[kData].get(s); - Tensor out = out_data[kOut].get(s); - Tensor tmp_norm = out_data[kTmpNorm].get(s); + Tensor data = in_data[lrn_enum::kData].get(s); + Tensor out = out_data[lrn_enum::kOut].get(s); + Tensor tmp_norm = out_data[lrn_enum::kTmpNorm].get(s); tmp_norm = chpool(F(data) , param_.nsize) * salpha + param_.knorm; - Assign(out, req[kOut], data * F(tmp_norm, -param_.beta)); + Assign(out, req[lrn_enum::kOut], data * F(tmp_norm, -param_.beta)); } virtual void Backward(const OpContext &ctx, @@ -79,10 +82,10 @@ class LocalResponseNormOp : public Operator { CHECK_EQ(out_data.size(), 2); const real_t salpha = param_.alpha / param_.nsize; Stream *s = ctx.get_stream(); - Tensor 
grad = out_grad[kOut].get(s); - Tensor tmp_norm = out_data[kTmpNorm].get(s); - Tensor data = in_data[kData].get(s); - Tensor grad_in = in_grad[kData].get(s); + Tensor grad = out_grad[lrn_enum::kOut].get(s); + Tensor tmp_norm = out_data[lrn_enum::kTmpNorm].get(s); + Tensor data = in_data[lrn_enum::kData].get(s); + Tensor grad_in = in_grad[lrn_enum::kData].get(s); grad_in = grad * F(tmp_norm, -param_.beta); grad_in += (- 2.0f * param_.beta * salpha) * chpool(grad * data * @@ -138,9 +141,9 @@ class LocalResponseNormProp : public OperatorProperty { const std::vector &in_data, const std::vector &out_data) const override { #if MXNET_USE_CUDNN == 1 - return {out_grad[kOut], in_data[kData], out_data[kOut]}; + return {out_grad[lrn_enum::kOut], in_data[lrn_enum::kData], out_data[lrn_enum::kOut]}; #else - return {out_grad[kOut], in_data[kData], out_data[kTmpNorm]}; + return {out_grad[lrn_enum::kOut], in_data[lrn_enum::kData], out_data[lrn_enum::kTmpNorm]}; #endif } @@ -152,7 +155,7 @@ class LocalResponseNormProp : public OperatorProperty { #if MXNET_USE_CUDNN == 1 return {}; #else - return {{out_grad[kOut], in_grad[kData]}}; + return {{out_grad[lrn_enum::kOut], in_grad[lrn_enum::kData]}}; #endif } diff --git a/src/operator/mshadow_op.h b/src/operator/mshadow_op.h index 9238ee049c0b..94db8d78cde0 100644 --- a/src/operator/mshadow_op.h +++ b/src/operator/mshadow_op.h @@ -80,6 +80,23 @@ struct tanh_grad { } }; +struct exp { + MSHADOW_XINLINE static real_t Map(real_t a) { + return expf(a); + } +}; + +struct log { + MSHADOW_XINLINE static real_t Map(real_t a) { + return logf(a); + } +}; + +struct log_grad { + MSHADOW_XINLINE static real_t Map(real_t a) { + return 1.0f / a; + } +}; struct square { MSHADOW_XINLINE static real_t Map(real_t a) { @@ -87,6 +104,12 @@ struct square { } }; +struct square_grad { + MSHADOW_XINLINE static real_t Map(real_t a) { + return 2.0f * a; + } +}; + /*! \brief used for generate Bernoulli mask */ struct threshold { MSHADOW_XINLINE static real_t Map(real_t a, real_t b) { @@ -107,6 +130,13 @@ struct square_root { return sqrt(a); } }; + +struct square_root_grad { + MSHADOW_XINLINE static real_t Map(real_t a) { + return 0.5f / a; + } +}; + } // namespace mshadow_op } // namespace op } // namespace mxnet diff --git a/src/operator/param.h b/src/operator/param.h deleted file mode 100644 index 9b08c197a160..000000000000 --- a/src/operator/param.h +++ /dev/null @@ -1,73 +0,0 @@ -/*! - * Copyright (c) 2015 by Contributors - * \file param.h - * \brief Common operator parameters - * \author Bing Xu -*/ -#ifndef MXNET_OPERATOR_PARAM_H_ -#define MXNET_OPERATOR_PARAM_H_ - -#include - -namespace mxnet { -namespace op { -/*! \brief possible parameter for each operator */ -struct Param { - /*! \brief number of hidden layers */ - int num_hidden; - /*! \brief number of output channel */ - int num_channel; - /*! \brief number of parallel group */ - int num_group; - /*! \brief kernel height */ - int kernel_y; - /*! \brief kernel width */ - int kernel_x; - /*! \brief stride in y dimension*/ - int stride_y; - /*! \brief stride in x dimension */ - int stride_x; - /*! \brief padding in y dimension */ - int pad_y; - /*! \brief padding in x dimension */ - int pad_x; - /*! \brief whether not include bias term */ - int no_bias; - /*! \brief maximum temp_col_size allowed in each layer */ - int temp_col_max; - /*! 
\brief reserved fields, for future compatibility */ - int reserved[64]; - - // constructor - Param() { - memset(this, 0, sizeof(Param)); - } - - inline void SetParam(const char *name, const char* val) { - if (!strcmp(name, "num_hidden")) num_hidden = atoi(val); - if (!strcmp(name, "num_channel")) num_channel = atoi(val); - if (!strcmp(name, "num_group")) num_group = atoi(val); - if (!strcmp(name, "kernel_size")) { - kernel_y = kernel_x = atoi(val); - } - if (!strcmp(name, "kernel_height")) kernel_y = atoi(val); - if (!strcmp(name, "kernel_width")) kernel_x = atoi(val); - if (!strcmp(name, "stride")) { - stride_y = stride_x = atoi(val); - } - if (!strcmp(name, "stride_y")) stride_y = atoi(val); - if (!strcmp(name, "stride_x")) stride_x = atoi(val); - - if (!strcmp(name, "pad")) { - pad_y = pad_x = atoi(val); - } - if (!strcmp(name, "pad_y")) pad_y = atoi(val); - if (!strcmp(name, "pad_x")) pad_x = atoi(val); - if (!strcmp(name, "no_bias")) no_bias = atoi(val); - if (!strcmp(name, "temp_col_max")) temp_col_max = atoi(val) << 18; - } -}; // struct Param -} // namespace op -} // namespace mxnet - -#endif // MXNET_OPERATOR_PARAM_H_ diff --git a/src/operator/pooling-inl.h b/src/operator/pooling-inl.h index b7eb8e2f2634..1f3d76e1ab7a 100644 --- a/src/operator/pooling-inl.h +++ b/src/operator/pooling-inl.h @@ -20,9 +20,12 @@ namespace mxnet { namespace op { + +namespace pool_enum { enum PoolingOpInputs {kData}; enum PoolingOpOutputs {kOut}; enum PoolingOpType {kMaxPooling, kAvgPooling, kSumPooling}; +} // namespace pool_enum struct PoolingParam : public dmlc::Parameter { TShape kernel; @@ -36,9 +39,9 @@ struct PoolingParam : public dmlc::Parameter { .describe("pooling kernel size: (y, x)"); DMLC_DECLARE_FIELD(pool_type) - .add_enum("max", kMaxPooling) - .add_enum("avg", kAvgPooling) - .add_enum("sum", kSumPooling) + .add_enum("max", pool_enum::kMaxPooling) + .add_enum("avg", pool_enum::kAvgPooling) + .add_enum("sum", pool_enum::kSumPooling) .describe("Pooling type to be applied."); int stride_shape[] = {1, 1}; @@ -70,23 +73,23 @@ class PoolingOp : public Operator { CHECK_EQ(in_data.size(), 1); CHECK_EQ(out_data.size(), 1); Stream *s = ctx.get_stream(); - Tensor data = in_data[kData].get(s); - Tensor out = out_data[kOut].get(s); + Tensor data = in_data[pool_enum::kData].get(s); + Tensor out = out_data[pool_enum::kOut].get(s); mshadow::Shape<2> out_shape = Shape2(out.shape_[2], out.shape_[3]); // TODO(bing): dual stride in mshadow CHECK_EQ(param_.stride[0], param_.stride[1]) << "Only same stride is supported now"; - if (param_.pool_type == kMaxPooling || param_.pool_type == kSumPooling) { + if (param_.pool_type == pool_enum::kMaxPooling || param_.pool_type == pool_enum::kSumPooling) { Assign(out, - req[kOut], + req[pool_enum::kOut], pool(pad(data, param_.pad[0], param_.pad[1]), out_shape, param_.kernel[0], param_.kernel[1], param_.stride[0])); - } else if (param_.pool_type == kAvgPooling) { + } else if (param_.pool_type == pool_enum::kAvgPooling) { Assign(out, - req[kOut], + req[pool_enum::kOut], (1.0f / (param_.kernel[0] * param_.kernel[1])) * \ pool(pad(data, param_.pad[0], param_.pad[1]), out_shape, @@ -112,15 +115,15 @@ class PoolingOp : public Operator { CHECK_EQ(in_grad.size(), 1); // TODO(bing): remove pad (0,0) Stream *s = ctx.get_stream(); - Tensor grad = out_grad[kOut].get(s); - Tensor data = in_data[kData].get(s); - Tensor output_data = out_data[kOut].get(s); - Tensor input_grad = in_grad[kData].get(s); + Tensor grad = out_grad[pool_enum::kOut].get(s); + Tensor data = 
in_data[pool_enum::kData].get(s); + Tensor output_data = out_data[pool_enum::kOut].get(s); + Tensor input_grad = in_grad[pool_enum::kData].get(s); mshadow::Shape<2> in_shape = Shape2(data.shape_[2], data.shape_[3]); - if (param_.pool_type == kMaxPooling || param_.pool_type == kSumPooling) { - Assign(input_grad, req[kData], + if (param_.pool_type == pool_enum::kMaxPooling || param_.pool_type == pool_enum::kSumPooling) { + Assign(input_grad, req[pool_enum::kData], crop(unpool(pad(data, param_.pad[0], param_.pad[1]), pad(output_data, 0, 0), pad(grad, 0, 0), @@ -130,8 +133,8 @@ class PoolingOp : public Operator { in_shape, param_.pad[0], param_.pad[1])); - } else if (param_.pool_type == kAvgPooling) { - Assign(input_grad, req[kData], + } else if (param_.pool_type == pool_enum::kAvgPooling) { + Assign(input_grad, req[pool_enum::kData], (1.0f / param_.kernel[0] / param_.kernel[1]) *\ crop(unpool(pad(data, param_.pad[0], param_.pad[1]), pad(output_data, 0, 0), @@ -197,7 +200,7 @@ class PoolingProp : public OperatorProperty { const std::vector &out_grad, const std::vector &in_data, const std::vector &out_data) const override { - return {out_grad[kOut], in_data[kData], out_data[kOut]}; + return {out_grad[pool_enum::kOut], in_data[pool_enum::kData], out_data[pool_enum::kOut]}; } std::vector > BackwardInplaceOption( @@ -208,7 +211,7 @@ class PoolingProp : public OperatorProperty { #if MXNET_USE_CUDNN == 1 return {}; #else - return {{in_data[kData], in_grad[kData]}}; + return {{in_data[pool_enum::kData], in_grad[pool_enum::kData]}}; #endif } diff --git a/src/operator/pooling.cc b/src/operator/pooling.cc index 44f80f854468..598fe176e5db 100644 --- a/src/operator/pooling.cc +++ b/src/operator/pooling.cc @@ -11,11 +11,11 @@ namespace op { template<> Operator *CreateOp(PoolingParam param) { switch (param.pool_type) { - case kMaxPooling: + case pool_enum::kMaxPooling: return new PoolingOp(param); - case kAvgPooling: + case pool_enum::kAvgPooling: return new PoolingOp(param); - case kSumPooling: + case pool_enum::kSumPooling: return new PoolingOp(param); default: LOG(FATAL) << "unknown activation type"; diff --git a/src/operator/pooling.cu b/src/operator/pooling.cu index 9b59ffaba6ad..57b92427dfe5 100644 --- a/src/operator/pooling.cu +++ b/src/operator/pooling.cu @@ -18,11 +18,11 @@ Operator *CreateOp(PoolingParam param) { return new CuDNNPoolingOp(param); #else switch (param.pool_type) { - case kMaxPooling: + case pool_enum::kMaxPooling: return new PoolingOp(param); - case kAvgPooling: + case pool_enum::kAvgPooling: return new PoolingOp(param); - case kSumPooling: + case pool_enum::kSumPooling: return new PoolingOp(param); default: LOG(FATAL) << "unknown activation type"; diff --git a/src/operator/regression_output-inl.h b/src/operator/regression_output-inl.h index 4c4bf6ffb625..479579d4b472 100644 --- a/src/operator/regression_output-inl.h +++ b/src/operator/regression_output-inl.h @@ -16,9 +16,12 @@ namespace mxnet { namespace op { + +namespace reg_enum { enum RegressionOutputOpInputs {kData, kLabel}; enum RegressionOutputOutputs {kOut}; enum RegressionOutputType {kLinear, kLogistic}; +} // reg_enum // Special Operator to output regression value in forward // And get gradient in calculation. 
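For context on the regression output operators that the following hunks re-scope (a sketch inferred from the losses they correspond to, not code quoted from this patch): the forward map is applied element-wise, and the backward pass reduces to the residual between prediction and label.

```cpp
#include <cmath>

// Illustrative stand-ins for a regression output's ForwardOp / BackwardOp
// functor pair; the identifiers are invented, only the math is the point.
struct linear_forward {    // LinearRegressionOutput: y = x (squared loss)
  static float Map(float x) { return x; }
};
struct logistic_forward {  // LogisticRegressionOutput: y = 1 / (1 + exp(-x))
  static float Map(float x) { return 1.0f / (1.0f + std::exp(-x)); }
};
struct residual_backward { // both cases: dLoss/dx = prediction - label
  static float Map(float out, float label) { return out - label; }
};
```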
@@ -35,9 +38,9 @@ class RegressionOutputOp : public Operator { CHECK_EQ(in_data.size(), 2) << "RegressionOutputOp Input: [data, label]"; CHECK_EQ(out_data.size(), 1) << "RegressionOutputOp Output: [output]"; Stream *s = ctx.get_stream(); - Tensor data = in_data[kData].FlatTo2D(s); - Tensor out = out_data[kOut].FlatTo2D(s); - Assign(out, req[kOut], F(data)); + Tensor data = in_data[reg_enum::kData].FlatTo2D(s); + Tensor out = out_data[reg_enum::kOut].FlatTo2D(s); + Assign(out, req[reg_enum::kOut], F(data)); } virtual void Backward(const OpContext &ctx, @@ -54,19 +57,19 @@ class RegressionOutputOp : public Operator { CHECK_GE(in_grad.size(), 1); CHECK_GE(req.size(), 1); Stream *s = ctx.get_stream(); - Tensor label = in_data[kLabel].get(s); - Tensor out = out_data[kOut].FlatTo2D(s); - Tensor grad = in_grad[kData].FlatTo2D(s); - Assign(grad, req[kData], F(out, reshape(label, grad.shape_))); + Tensor label = in_data[reg_enum::kLabel].get(s); + Tensor out = out_data[reg_enum::kOut].FlatTo2D(s); + Tensor grad = in_grad[reg_enum::kData].FlatTo2D(s); + Assign(grad, req[reg_enum::kData], F(out, reshape(label, grad.shape_))); } }; // Decalre Factory function, used for dispatch specialization template -Operator* CreateRegressionOutputOp(RegressionOutputType type); +Operator* CreateRegressionOutputOp(reg_enum::RegressionOutputType type); #if DMLC_USE_CXX11 -template +template class RegressionOutputProp : public OperatorProperty { public: std::vector ListArguments() const override { @@ -100,8 +103,8 @@ class RegressionOutputProp : public OperatorProperty { std::string TypeString() const override { switch (type) { - case kLinear: return "LinearRegressionOutput"; - case kLogistic: return "LogisticRegressionOutput"; + case reg_enum::kLinear: return "LinearRegressionOutput"; + case reg_enum::kLogistic: return "LogisticRegressionOutput"; default: LOG(FATAL) << "unknown type"; return ""; } } @@ -110,7 +113,7 @@ class RegressionOutputProp : public OperatorProperty { const std::vector &out_grad, const std::vector &in_data, const std::vector &out_data) const override { - return {in_data[kLabel], out_data[kOut]}; + return {in_data[reg_enum::kLabel], out_data[reg_enum::kOut]}; } std::vector > BackwardInplaceOption( @@ -118,13 +121,13 @@ class RegressionOutputProp : public OperatorProperty { const std::vector &in_data, const std::vector &out_data, const std::vector &in_grad) const override { - return {{out_data[kOut], in_grad[kData]}}; + return {{out_data[reg_enum::kOut], in_grad[reg_enum::kData]}}; } std::vector > ForwardInplaceOption( const std::vector &in_data, const std::vector &out_data) const override { - return {{in_data[kData], out_data[kOut]}}; + return {{in_data[reg_enum::kData], out_data[reg_enum::kOut]}}; } Operator* CreateOperator(Context ctx) const; diff --git a/src/operator/regression_output.cc b/src/operator/regression_output.cc index e10888d624e3..5d729c57f730 100644 --- a/src/operator/regression_output.cc +++ b/src/operator/regression_output.cc @@ -10,11 +10,11 @@ namespace mxnet { namespace op { template<> -Operator *CreateRegressionOutputOp(RegressionOutputType type) { +Operator *CreateRegressionOutputOp(reg_enum::RegressionOutputType type) { switch (type) { - case kLinear: + case reg_enum::kLinear: return new RegressionOutputOp(); - case kLogistic: + case reg_enum::kLogistic: return new RegressionOutputOp(); default: LOG(FATAL) << "unknown activation type " << type; @@ -23,17 +23,17 @@ Operator *CreateRegressionOutputOp(RegressionOutputType type) { } // DO_BIND_DISPATCH comes from 
operator_common.h -template +template Operator *RegressionOutputProp::CreateOperator(Context ctx) const { DO_BIND_DISPATCH(CreateRegressionOutputOp, type); } -MXNET_REGISTER_OP_PROPERTY(LinearRegressionOutput, RegressionOutputProp) +MXNET_REGISTER_OP_PROPERTY(LinearRegressionOutput, RegressionOutputProp) .describe("Use linear regression for final output, this is used on final output of a net.") .add_argument("data", "Symbol", "Input data to function.") .add_argument("label", "Symbol", "Input label to function."); -MXNET_REGISTER_OP_PROPERTY(LogisticRegressionOutput, RegressionOutputProp) +MXNET_REGISTER_OP_PROPERTY(LogisticRegressionOutput, RegressionOutputProp) .describe("Use Logistic regression for final output, this is used on final output of a net.\n" "Logistic regression is suitable for binary classification " "or probability prediction tasks.") diff --git a/src/operator/regression_output.cu b/src/operator/regression_output.cu index c653b556278d..64968f2e968b 100644 --- a/src/operator/regression_output.cu +++ b/src/operator/regression_output.cu @@ -10,11 +10,11 @@ namespace mxnet { namespace op { template<> -Operator *CreateRegressionOutputOp(RegressionOutputType type) { +Operator *CreateRegressionOutputOp(reg_enum::RegressionOutputType type) { switch (type) { - case kLinear: + case reg_enum::kLinear: return new RegressionOutputOp(); - case kLogistic: + case reg_enum::kLogistic: return new RegressionOutputOp(); default: LOG(FATAL) << "unknown activation type " << type; diff --git a/src/operator/reshape-inl.h b/src/operator/reshape-inl.h index 730751b1a594..12c2071a8c97 100644 --- a/src/operator/reshape-inl.h +++ b/src/operator/reshape-inl.h @@ -20,8 +20,10 @@ namespace mxnet { namespace op { +namespace reshape_enum { enum ReshapeOpInputs {kData}; enum ReshapeOpOutputs {kOut}; +} // namespace reshape_enum struct ReshapeParam : public dmlc::Parameter { TShape target_shape; @@ -33,6 +35,8 @@ struct ReshapeParam : public dmlc::Parameter { template class ReshapeOp : public Operator { public: + explicit ReshapeOp(ReshapeParam param) {} // Do nothing + virtual void Forward(const OpContext &ctx, const std::vector &in_data, const std::vector &req, @@ -43,16 +47,15 @@ class ReshapeOp : public Operator { CHECK_EQ(in_data.size(), 1); CHECK_EQ(req.size(), 1); CHECK_EQ(out_data.size(), 1); - if (req[kOut] == kNullOp) return; + if (req[reshape_enum::kOut] == kNullOp) return; Stream *s = ctx.get_stream(); - // TODO(bing): potentail bug here for non-4D input - Tensor data = in_data[kData].get(s); - Tensor out = out_data[kOut].get(s); + Tensor data = in_data[reshape_enum::kData].FlatTo2D(s); + Tensor out = out_data[reshape_enum::kOut].FlatTo2D(s); CHECK_EQ(data.CheckContiguous(), true); CHECK_EQ(out.CheckContiguous(), true); if (data.dptr_ == out.dptr_) return; CHECK_EQ(data.shape_.Size(), out.shape_.Size()); - Assign(out, req[kOut], reshape(data, out.shape_)); + Assign(out, req[reshape_enum::kOut], reshape(data, out.shape_)); } virtual void Backward(const OpContext &ctx, @@ -65,22 +68,22 @@ class ReshapeOp : public Operator { using namespace mshadow; using namespace mshadow::expr; CHECK_EQ(req.size(), 1); - if (req[kData] == kNullOp) return; + if (req[reshape_enum::kData] == kNullOp) return; CHECK_EQ(out_grad.size(), 1); CHECK_EQ(in_grad.size(), 1); Stream *s = ctx.get_stream(); - Tensor grad_out = out_grad[kData].get(s); - Tensor grad_in = in_grad[kOut].get(s); + Tensor grad_in = in_grad[reshape_enum::kOut].FlatTo2D(s); + Tensor grad_out = out_grad[reshape_enum::kData].FlatTo2D(s); 
CHECK_EQ(grad_out.CheckContiguous(), true); CHECK_EQ(grad_in.CheckContiguous(), true); if (grad_out.dptr_ == grad_in.dptr_) return; CHECK_EQ(grad_out.shape_.Size(), grad_in.shape_.Size()); - Assign(grad_in, req[kData], reshape(grad_out, grad_in.shape_)); + Assign(grad_in, req[reshape_enum::kData], reshape(grad_out, grad_in.shape_)); } }; // class ReshapeOp template -Operator* CreateOp(); +Operator* CreateOp(ReshapeParam); #if DMLC_USE_CXX11 class ReshapeProp : public OperatorProperty { @@ -99,7 +102,7 @@ class ReshapeProp : public OperatorProperty { std::vector *out_shape, std::vector *aux_shape) const override { CHECK_EQ(in_shape->size(), 1) << "Input: [data]"; - const TShape &dshape = in_shape->at(kData); + const TShape &dshape = in_shape->at(reshape_enum::kData); if (dshape.ndim() == 0) return false; CHECK(param_.target_shape.Size() == dshape.Size()) << "Target shape size is different to source. " @@ -124,13 +127,13 @@ class ReshapeProp : public OperatorProperty { const std::vector &out_grad, const std::vector &in_data, const std::vector &out_data) const override { - return {out_grad[kOut]}; + return {out_grad[reshape_enum::kOut]}; } std::vector > ForwardInplaceOption( const std::vector &in_data, const std::vector &out_data) const override { - return {{in_data[kData], out_data[kOut]}}; + return {{in_data[reshape_enum::kData], out_data[reshape_enum::kOut]}}; } std::vector > BackwardInplaceOption( @@ -138,7 +141,7 @@ class ReshapeProp : public OperatorProperty { const std::vector &in_data, const std::vector &out_data, const std::vector &in_grad) const override { - return {{out_grad[kOut], in_grad[kData]}}; + return {{out_grad[reshape_enum::kOut], in_grad[reshape_enum::kData]}}; } Operator* CreateOperator(Context ctx) const; @@ -164,14 +167,14 @@ class FlattenProp : public ReshapeProp { std::vector *out_shape, std::vector *aux_shape) const override { CHECK_EQ(in_shape->size(), 1) << "Input: [data]"; - const TShape &dshape = in_shape->at(kData); + const TShape &dshape = in_shape->at(reshape_enum::kData); if (dshape.ndim() == 0) return false; out_shape->clear(); uint32_t target_dim = 1; for (uint32_t i = 1; i < dshape.ndim(); ++i) { target_dim *= dshape[i]; } - out_shape->push_back(mshadow::Shape4(dshape[0], 1, 1, target_dim)); + out_shape->push_back(mshadow::Shape2(dshape[0], target_dim)); return true; } diff --git a/src/operator/reshape.cc b/src/operator/reshape.cc index 6bd077172d4a..bc4375b136ce 100644 --- a/src/operator/reshape.cc +++ b/src/operator/reshape.cc @@ -11,12 +11,12 @@ namespace mxnet { namespace op { template<> -Operator *CreateOp() { - return new ReshapeOp(); +Operator *CreateOp(ReshapeParam param) { + return new ReshapeOp(param); } Operator* ReshapeProp::CreateOperator(Context ctx) const { - DO_BIND_DISPATCH(CreateOp); + DO_BIND_DISPATCH(CreateOp, param_); } DMLC_REGISTER_PARAMETER(ReshapeParam); diff --git a/src/operator/reshape.cu b/src/operator/reshape.cu index b810862f3c73..06bbaec1fdfd 100644 --- a/src/operator/reshape.cu +++ b/src/operator/reshape.cu @@ -11,8 +11,8 @@ namespace mxnet { namespace op { template<> -Operator *CreateOp() { - return new ReshapeOp(); +Operator *CreateOp(ReshapeParam param) { + return new ReshapeOp(param); } } // namespace op diff --git a/src/operator/slice_channel-inl.h b/src/operator/slice_channel-inl.h index 25d8ef2cd844..05e3da199bda 100644 --- a/src/operator/slice_channel-inl.h +++ b/src/operator/slice_channel-inl.h @@ -21,8 +21,10 @@ namespace mxnet { namespace op { +namespace slice_enum { enum SliceChannelOpInputs {kData}; enum 
SliceChannelOpOutputs {kOut0, kOut1, kOut2, kOut3, kOut4}; +} // namespace slice_enum struct SliceChannelParam : public dmlc::Parameter { int num_outputs; @@ -50,18 +52,17 @@ class SliceChannelOp : public Operator { Stream *s = ctx.get_stream(); std::vector > outputs(size_); Tensor data; - if (in_data[kData].ndim() == 2) { - uint32_t ds[] = {in_data[kData].shape_[0], in_data[kData].shape_[1], 1, 1}; - TShape dshape(ds, ds + 4); - data = in_data[kData].get_with_shape(dshape, s); - uint32_t dim = data.shape_[1] / size_; - ds[1] = dim; - TShape slice_shape(ds, ds + 4); + if (in_data[slice_enum::kData].ndim() == 2) { + Shape<4> dshape = Shape4(in_data[slice_enum::kData].shape_[0], + in_data[slice_enum::kData].shape_[1], 1, 1); + data = in_data[slice_enum::kData].get_with_shape(dshape, s); + Shape<4> slice_shape = dshape; + slice_shape[1] = dshape[1] / size_; for (int i = 0; i < size_; ++i) { outputs[i] = out_data[i].get_with_shape(slice_shape, s); } } else { - data = in_data[kData].get(s); + data = in_data[slice_enum::kData].get(s); for (int i = 0; i < size_; ++i) { outputs[i] = out_data[i].get(s); } @@ -83,20 +84,20 @@ class SliceChannelOp : public Operator { Stream *s = ctx.get_stream(); std::vector > grad_out(size_); Tensor grad; - if (out_grad[kOut0].ndim() == 2) { - uint32_t ds[] = {out_grad[kOut0].shape_[0], out_grad[kOut0].shape_[1], 1, 1}; - TShape slice_shape(ds, ds + 4); + if (out_grad[slice_enum::kOut0].ndim() == 2) { + Shape<4> slice_shape = Shape4(out_grad[slice_enum::kOut0].shape_[0], + out_grad[slice_enum::kOut0].shape_[1], 1, 1); for (int i = 0; i < size_; ++i) { grad_out[i] = out_grad[i].get_with_shape(slice_shape, s); } - ds[1] *= size_; - TShape dshape(ds, ds + 4); - grad = in_grad[kData].get_with_shape(dshape, s); + Shape<4> dshape = slice_shape; + dshape[1] *= size_; + grad = in_grad[slice_enum::kData].get_with_shape(dshape, s); } else { for (int i = 0; i < size_; ++i) { grad_out[i] = out_grad[i].get(s); } - grad = in_grad[kData].get(s); + grad = in_grad[slice_enum::kData].get(s); } Concatenate(grad_out, &grad); } @@ -138,7 +139,7 @@ class SliceChannelProp : public OperatorProperty { std::vector *aux_shape) const override { using namespace mshadow; CHECK_EQ(in_shape->size(), 1); - TShape dshape = in_shape->at(kData); + TShape dshape = in_shape->at(slice_enum::kData); if (dshape.ndim() == 0) return false; CHECK_GT(dshape.ndim(), 1); CHECK_EQ(dshape[1] % param_.num_outputs, 0) diff --git a/src/operator/softmax-inl.h b/src/operator/softmax-inl.h index 87cab0cb4568..d1e5331d9d06 100644 --- a/src/operator/softmax-inl.h +++ b/src/operator/softmax-inl.h @@ -20,14 +20,21 @@ namespace mxnet { namespace op { +namespace softmax_enum { enum SoftmaxOpInputs {kData, kLabel}; enum SoftmaxOpOutputs {kOut}; +} // namespace softmax_enum struct SoftmaxParam : public dmlc::Parameter { float grad_scale; + bool multi_output; DMLC_DECLARE_PARAMETER(SoftmaxParam) { DMLC_DECLARE_FIELD(grad_scale).set_default(1.0f) .describe("Scale the gradient by a float factor"); + DMLC_DECLARE_FIELD(multi_output).set_default(false) + .describe("If set to true, for a (n,k,x_1,..,x_n) dimensional" + "input tensor, softmax will generate n*x_1*...*x_n output, each" + "has k classes"); }; }; @@ -46,9 +53,18 @@ class SoftmaxOp : public Operator { CHECK_EQ(in_data.size(), 2) << "Softmax Input: [data, label]"; CHECK_EQ(out_data.size(), 1) << "Softmax Output: [output]"; Stream *s = ctx.get_stream(); - Tensor data = in_data[kData].FlatTo2D(s); - Tensor out = out_data[kOut].FlatTo2D(s); - Softmax(out, data); + if 
(param_.multi_output) { + int n = in_data[softmax_enum::kData].size(0); + int k = in_data[softmax_enum::kData].size(1); + Shape<3> s3 = Shape3(n, k, static_cast(in_data[softmax_enum::kData].Size()/n/k)); + Tensor data = in_data[softmax_enum::kData].get_with_shape(s3, s); + Tensor out = out_data[softmax_enum::kOut].get_with_shape(s3, s); + Softmax(out, data); + } else { + Tensor data = in_data[softmax_enum::kData].FlatTo2D(s); + Tensor out = out_data[softmax_enum::kOut].FlatTo2D(s); + Softmax(out, data); + } } virtual void Backward(const OpContext &ctx, @@ -65,12 +81,25 @@ class SoftmaxOp : public Operator { CHECK_GE(in_grad.size(), 1); CHECK_GE(req.size(), 1); Stream *s = ctx.get_stream(); - Tensor label = in_data[kLabel].get(s); - Tensor out = out_data[kOut].FlatTo2D(s); - Tensor grad = in_grad[kData].FlatTo2D(s); - SoftmaxGrad(grad, out, label); - if (param_.grad_scale < 1.0) { - grad *= param_.grad_scale; + if (param_.multi_output) { + int n = out_data[softmax_enum::kOut].size(0); + int k = out_data[softmax_enum::kOut].size(1); + Shape<3> s3 = Shape3(n, k, static_cast(out_data[softmax_enum::kOut].Size()/n/k)); + Tensor label = in_data[softmax_enum::kLabel].FlatTo2D(s); + Tensor out = out_data[softmax_enum::kOut].get_with_shape(s3, s); + Tensor grad = in_grad[softmax_enum::kData].get_with_shape(s3, s); + SoftmaxGrad(grad, out, label); + if (param_.grad_scale < 1.0) { + grad *= param_.grad_scale; + } + } else { + Tensor label = in_data[softmax_enum::kLabel].get(s); + Tensor out = out_data[softmax_enum::kOut].FlatTo2D(s); + Tensor grad = in_grad[softmax_enum::kData].FlatTo2D(s); + SoftmaxGrad(grad, out, label); + if (param_.grad_scale < 1.0) { + grad *= param_.grad_scale; + } } } @@ -104,7 +133,12 @@ class SoftmaxProp : public OperatorProperty { CHECK_EQ(in_shape->size(), 2) << "Input:[data, label]"; const TShape &dshape = in_shape->at(0); if (dshape.ndim() == 0) return false; - SHAPE_ASSIGN_CHECK(*in_shape, kLabel, Shape1(dshape[0])); + if (param_.multi_output) { + SHAPE_ASSIGN_CHECK(*in_shape, softmax_enum::kLabel, + Shape2(dshape[0], dshape.Size()/dshape[0]/dshape[1])); + } else { + SHAPE_ASSIGN_CHECK(*in_shape, softmax_enum::kLabel, Shape1(dshape[0])); + } out_shape->clear(); out_shape->push_back(dshape); return true; @@ -124,7 +158,7 @@ class SoftmaxProp : public OperatorProperty { const std::vector &out_grad, const std::vector &in_data, const std::vector &out_data) const override { - return {in_data[kLabel], out_data[kOut]}; + return {in_data[softmax_enum::kLabel], out_data[softmax_enum::kOut]}; } std::vector > BackwardInplaceOption( @@ -132,13 +166,13 @@ class SoftmaxProp : public OperatorProperty { const std::vector &in_data, const std::vector &out_data, const std::vector &in_grad) const override { - return {{out_data[kOut], in_grad[kData]}}; + return {{out_data[softmax_enum::kOut], in_grad[softmax_enum::kData]}}; } std::vector > ForwardInplaceOption( const std::vector &in_data, const std::vector &out_data) const override { - return {{in_data[kData], out_data[kOut]}}; + return {{in_data[softmax_enum::kData], out_data[softmax_enum::kOut]}}; } Operator* CreateOperator(Context ctx) const; diff --git a/src/symbol/graph_executor.cc b/src/symbol/graph_executor.cc index 59e821e372db..847d3d47adad 100644 --- a/src/symbol/graph_executor.cc +++ b/src/symbol/graph_executor.cc @@ -247,6 +247,7 @@ GraphExecutor::GetOpExecEntry(uint32_t nid) { } GraphExecutor::~GraphExecutor() { + Engine::Get()->WaitForAll(); // need to delete the operators before delete the NDArray they referenced. 
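One inferred reasoning step behind the single-line `GraphExecutor` change above: operations are pushed to the engine asynchronously, so some may still be running when the executor is destroyed and may still reference the operators and NDArrays it owns. Waiting for the engine to drain before the deletion loop below removes that race; this reading follows from the engine's asynchronous scheduling model rather than from any comment in the patch itself.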
for (OpNode& node : op_nodes_) { node.DeleteOperator(); diff --git a/tests/python/gpu/test_operator_gpu.py b/tests/python/gpu/test_operator_gpu.py new file mode 100644 index 000000000000..f8f43e3d52dc --- /dev/null +++ b/tests/python/gpu/test_operator_gpu.py @@ -0,0 +1,7 @@ +import sys +sys.path.insert(0, '../unittest') +from test_operator import * + +if __name__ == '__main__': + test_softmax_with_shape((3,4), mx.gpu()) + test_multi_softmax_with_shape((3,4,5), mx.gpu()) \ No newline at end of file diff --git a/tests/python/multi-node/README.md b/tests/python/multi-node/README.md index 0101e66186ed..9713199ee17c 100644 --- a/tests/python/multi-node/README.md +++ b/tests/python/multi-node/README.md @@ -1,15 +1,312 @@ # Test multi-devices and multi-machines -must disable `CUDNN` +Note that `CUDNN` leads to randomness, need to disable if comparing to the baseline -`local_*` for multi-devices and single machine. Requires two GPUs. +- `local_*` for multi-devices and single machine. Requires two GPUs. +- `dist_sync_*` for multi-machines with BSP synchronizations -`dist_*` for multi-machines. Run in local machine with 2 workers (requires at -least two gpus) and 2 servers. - +`dist_async_*` for multi-machines with asynchronous SGD ``` -ln -s ../../../dmlc-core/tracker/dmlc_local.py +ln -s ../../../ps-lite/tracker/dmlc_local.py . ./dmlc_local.py -n 2 -s 2 ./dist_sync_mlp.py ``` + +# Results + +## cifar10, inceptions + +single gtx 980. batch size = 128 and learning rate = .1 + +``` +[03:42:04] src/io/iter_image_recordio.cc:212: ImageRecordIOParser: data/cifar/train.rec, use 4 threads for decoding.. +[03:42:04] src/io/./iter_normalize.h:98: Load mean image from data/cifar/cifar_mean.bin +[03:42:04] src/io/iter_image_recordio.cc:212: ImageRecordIOParser: data/cifar/test.rec, use 4 threads for decoding.. 
+[03:42:04] src/io/./iter_normalize.h:98: Load mean image from data/cifar/cifar_mean.bin +INFO:root:Iteration[0] Train-accuracy=0.523938 +INFO:root:Iteration[0] Time cost=104.396 +INFO:root:Iteration[0] Validation-accuracy=0.665941 +INFO:root:Iteration[1] Train-accuracy=0.721108 +INFO:root:Iteration[1] Time cost=105.245 +INFO:root:Iteration[1] Validation-accuracy=0.755934 +INFO:root:Iteration[2] Train-accuracy=0.793298 +INFO:root:Iteration[2] Time cost=105.101 +INFO:root:Iteration[2] Validation-accuracy=0.784909 +INFO:root:Iteration[3] Train-accuracy=0.835198 +INFO:root:Iteration[3] Time cost=104.816 +INFO:root:Iteration[3] Validation-accuracy=0.799150 +INFO:root:Iteration[4] Train-accuracy=0.869625 +INFO:root:Iteration[4] Time cost=104.571 +INFO:root:Iteration[4] Validation-accuracy=0.809533 +INFO:root:Iteration[5] Train-accuracy=0.895201 +INFO:root:Iteration[5] Time cost=104.357 +INFO:root:Iteration[5] Validation-accuracy=0.811214 +INFO:root:Iteration[6] Train-accuracy=0.911025 +INFO:root:Iteration[6] Time cost=104.347 +INFO:root:Iteration[6] Validation-accuracy=0.799644 +INFO:root:Iteration[7] Train-accuracy=0.923853 +INFO:root:Iteration[7] Time cost=104.108 +INFO:root:Iteration[7] Validation-accuracy=0.806468 +INFO:root:Iteration[8] Train-accuracy=0.936301 +INFO:root:Iteration[8] Time cost=104.178 +INFO:root:Iteration[8] Validation-accuracy=0.813687 +INFO:root:Iteration[9] Train-accuracy=0.950068 +INFO:root:Iteration[9] Time cost=104.522 +INFO:root:Iteration[9] Validation-accuracy=0.820115 +INFO:root:Accuracy = 0.820100 +``` + +using 3x dual gtx 980 machines, async inception with batch size = 128 and +learning rate = .05 + + +``` +[03:23:29] src/io/iter_image_recordio.cc:212: ImageRecordIOParser: data/cifar/train.rec, use 4 threads for decoding.. +[03:23:31] src/io/iter_image_recordio.cc:212: ImageRecordIOParser: data/cifar/train.rec, use 4 threads for decoding.. +[03:23:29] src/io/./iter_normalize.h:98: Load mean image from data/cifar/cifar_mean.bin +[03:23:31] src/io/./iter_normalize.h:98: Load mean image from data/cifar/cifar_mean.bin +[03:23:30] src/io/iter_image_recordio.cc:212: ImageRecordIOParser: data/cifar/train.rec, use 4 threads for decoding.. +[03:23:30] src/io/./iter_normalize.h:98: Load mean image from data/cifar/cifar_mean.bin +[03:23:29] src/io/iter_image_recordio.cc:212: ImageRecordIOParser: data/cifar/test.rec, use 4 threads for decoding.. +[03:23:31] src/io/iter_image_recordio.cc:212: ImageRecordIOParser: data/cifar/test.rec, use 4 threads for decoding.. +[03:23:29] src/io/./iter_normalize.h:98: Load mean image from data/cifar/cifar_mean.bin +[03:23:31] src/io/./iter_normalize.h:98: Load mean image from data/cifar/cifar_mean.bin +[03:23:30] src/io/iter_image_recordio.cc:212: ImageRecordIOParser: data/cifar/test.rec, use 4 threads for decoding.. 
+[03:23:30] src/io/./iter_normalize.h:98: Load mean image from data/cifar/cifar_mean.bin +INFO:root:Start training with [gpu(0), gpu(1)] +INFO:root:Start training with [gpu(0), gpu(1)] +INFO:root:Start training with [gpu(0), gpu(1)] +INFO:root:Iteration[0] Train-accuracy=0.185276 +INFO:root:Iteration[0] Time cost=21.556 +INFO:root:Iteration[0] Train-accuracy=0.184255 +INFO:root:Iteration[0] Time cost=22.021 +INFO:root:Iteration[0] Train-accuracy=0.183834 +INFO:root:Iteration[0] Time cost=22.342 +INFO:root:Iteration[0] Validation-accuracy=0.225079 +INFO:root:Iteration[0] Validation-accuracy=0.236452 +INFO:root:Iteration[0] Validation-accuracy=0.237836 +INFO:root:Iteration[1] Train-accuracy=0.308624 +INFO:root:Iteration[1] Time cost=21.617 +INFO:root:Iteration[1] Train-accuracy=0.312977 +INFO:root:Iteration[1] Time cost=21.603 +INFO:root:Iteration[1] Train-accuracy=0.309637 +INFO:root:Iteration[1] Time cost=21.917 +INFO:root:Iteration[1] Validation-accuracy=0.333169 +INFO:root:Iteration[1] Validation-accuracy=0.382812 +INFO:root:Iteration[1] Validation-accuracy=0.385186 +INFO:root:Iteration[2] Train-accuracy=0.426885 +INFO:root:Iteration[2] Time cost=21.531 +INFO:root:Iteration[2] Train-accuracy=0.420802 +INFO:root:Iteration[2] Time cost=21.469 +INFO:root:Iteration[2] Train-accuracy=0.436844 +INFO:root:Iteration[2] Time cost=22.053 +INFO:root:Iteration[2] Validation-accuracy=0.487935 +INFO:root:Iteration[2] Validation-accuracy=0.491495 +INFO:root:Iteration[2] Validation-accuracy=0.532832 +INFO:root:Iteration[3] Train-accuracy=0.541209 +INFO:root:Iteration[3] Time cost=21.817 +INFO:root:Iteration[3] Train-accuracy=0.544072 +INFO:root:Iteration[3] Time cost=21.759 +INFO:root:Iteration[3] Train-accuracy=0.546458 +INFO:root:Iteration[3] Time cost=22.156 +INFO:root:Iteration[3] Validation-accuracy=0.589102 +INFO:root:Iteration[3] Validation-accuracy=0.559138 +INFO:root:Iteration[3] Validation-accuracy=0.613528 +INFO:root:Iteration[4] Train-accuracy=0.618500 +INFO:root:Iteration[4] Time cost=21.552 +INFO:root:Iteration[4] Train-accuracy=0.614862 +INFO:root:Iteration[4] Time cost=21.544 +INFO:root:Iteration[4] Train-accuracy=0.619573 +INFO:root:Iteration[4] Time cost=21.890 +INFO:root:Iteration[4] Validation-accuracy=0.630241 +INFO:root:Iteration[4] Validation-accuracy=0.618176 +INFO:root:Iteration[4] Validation-accuracy=0.666930 +INFO:root:Iteration[5] Train-accuracy=0.673843 +INFO:root:Iteration[5] Time cost=21.056 +INFO:root:Iteration[5] Train-accuracy=0.675692 +INFO:root:Iteration[5] Time cost=21.120 +INFO:root:Iteration[5] Train-accuracy=0.678912 +INFO:root:Iteration[5] Time cost=21.721 +INFO:root:Iteration[5] Validation-accuracy=0.657634 +INFO:root:Iteration[5] Validation-accuracy=0.677809 +INFO:root:Iteration[5] Validation-accuracy=0.715882 +INFO:root:Iteration[6] Train-accuracy=0.722149 +INFO:root:Iteration[6] Time cost=20.579 +INFO:root:Iteration[6] Train-accuracy=0.724833 +INFO:root:Iteration[6] Time cost=20.548 +INFO:root:Iteration[6] Train-accuracy=0.720241 +INFO:root:Iteration[6] Time cost=20.772 +INFO:root:Iteration[6] Validation-accuracy=0.692939 +INFO:root:Iteration[6] Validation-accuracy=0.714794 +INFO:root:Iteration[6] Validation-accuracy=0.748220 +INFO:root:Iteration[7] Train-accuracy=0.760854 +INFO:root:Iteration[7] Time cost=20.801 +INFO:root:Iteration[7] Train-accuracy=0.757276 +INFO:root:Iteration[7] Time cost=21.080 +INFO:root:Iteration[7] Validation-accuracy=0.735858 +INFO:root:Iteration[7] Train-accuracy=0.758767 +INFO:root:Iteration[7] Time cost=21.353 
+INFO:root:Iteration[7] Validation-accuracy=0.737638 +INFO:root:Iteration[7] Validation-accuracy=0.774328 +INFO:root:Iteration[8] Train-accuracy=0.794967 +INFO:root:Iteration[8] Time cost=21.593 +INFO:root:Iteration[8] Train-accuracy=0.798485 +INFO:root:Iteration[8] Time cost=21.672 +INFO:root:Iteration[8] Validation-accuracy=0.762460 +INFO:root:Iteration[8] Train-accuracy=0.795503 +INFO:root:Iteration[8] Time cost=22.155 +INFO:root:Iteration[8] Validation-accuracy=0.745748 +INFO:root:Iteration[8] Validation-accuracy=0.784513 +INFO:root:Iteration[9] Train-accuracy=0.825561 +INFO:root:Iteration[9] Time cost=21.644 +INFO:root:Iteration[9] Train-accuracy=0.821923 +INFO:root:Iteration[9] Time cost=21.479 +INFO:root:Iteration[9] Validation-accuracy=0.727453 +INFO:root:Iteration[9] Validation-accuracy=0.745253 +INFO:root:Iteration[9] Train-accuracy=0.819716 +INFO:root:Iteration[9] Time cost=21.927 +INFO:root:Iteration[9] Validation-accuracy=0.781151 +INFO:root:Iteration[10] Train-accuracy=0.842975 +INFO:root:Iteration[10] Time cost=21.431 +INFO:root:Iteration[10] Train-accuracy=0.841543 +INFO:root:Iteration[10] Time cost=21.387 +INFO:root:Iteration[10] Validation-accuracy=0.768196 +INFO:root:Iteration[10] Validation-accuracy=0.781448 +INFO:root:Iteration[10] Train-accuracy=0.843989 +INFO:root:Iteration[10] Time cost=21.875 +INFO:root:Iteration[10] Validation-accuracy=0.804391 +INFO:root:Iteration[11] Train-accuracy=0.860329 +INFO:root:Iteration[11] Time cost=20.664 +INFO:root:Iteration[11] Train-accuracy=0.858958 +INFO:root:Iteration[11] Time cost=20.734 +INFO:root:Iteration[11] Validation-accuracy=0.780063 +INFO:root:Iteration[11] Validation-accuracy=0.774426 +INFO:root:Iteration[11] Train-accuracy=0.861104 +INFO:root:Iteration[11] Time cost=21.449 +INFO:root:Iteration[11] Validation-accuracy=0.818335 +INFO:root:Iteration[12] Train-accuracy=0.885973 +INFO:root:Iteration[12] Time cost=21.037 +INFO:root:Iteration[12] Train-accuracy=0.887583 +INFO:root:Iteration[12] Time cost=21.066 +INFO:root:Iteration[12] Validation-accuracy=0.798358 +INFO:root:Iteration[12] Validation-accuracy=0.803204 +INFO:root:Iteration[12] Train-accuracy=0.885914 +INFO:root:Iteration[12] Time cost=21.738 +INFO:root:Iteration[12] Validation-accuracy=0.812203 +INFO:root:Iteration[13] Train-accuracy=0.904103 +INFO:root:Iteration[13] Time cost=21.326 +INFO:root:Iteration[13] Train-accuracy=0.904282 +INFO:root:Iteration[13] Time cost=21.278 +INFO:root:Iteration[13] Validation-accuracy=0.791238 +INFO:root:Iteration[13] Validation-accuracy=0.799842 +INFO:root:Iteration[13] Train-accuracy=0.901002 +INFO:root:Iteration[13] Time cost=21.408 +INFO:root:Iteration[13] Validation-accuracy=0.802116 +INFO:root:Iteration[14] Train-accuracy=0.911140 +INFO:root:Iteration[14] Time cost=21.527 +INFO:root:Iteration[14] Train-accuracy=0.913705 +INFO:root:Iteration[14] Time cost=21.569 +INFO:root:Iteration[14] Validation-accuracy=0.803204 +INFO:root:Iteration[14] Validation-accuracy=0.803303 +INFO:root:Iteration[14] Train-accuracy=0.914182 +INFO:root:Iteration[14] Time cost=22.170 +INFO:root:Iteration[14] Validation-accuracy=0.771460 +INFO:root:Iteration[15] Train-accuracy=0.915852 +INFO:root:Iteration[15] Time cost=21.608 +INFO:root:Iteration[15] Train-accuracy=0.911975 +INFO:root:Iteration[15] Time cost=21.623 +INFO:root:Iteration[15] Validation-accuracy=0.801325 +INFO:root:Iteration[15] Validation-accuracy=0.798259 +INFO:root:Iteration[15] Train-accuracy=0.923008 +INFO:root:Iteration[15] Time cost=21.806 +INFO:root:Iteration[15] 
Validation-accuracy=0.809335 +INFO:root:Iteration[16] Train-accuracy=0.938096 +INFO:root:Iteration[16] Time cost=21.857 +INFO:root:Iteration[16] Train-accuracy=0.944358 +INFO:root:Iteration[16] Time cost=21.954 +INFO:root:Iteration[16] Validation-accuracy=0.790249 +INFO:root:Iteration[16] Validation-accuracy=0.795095 +INFO:root:Iteration[16] Train-accuracy=0.947877 +INFO:root:Iteration[16] Time cost=21.844 +INFO:root:Iteration[16] Validation-accuracy=0.812797 +INFO:root:Iteration[17] Train-accuracy=0.953006 +INFO:root:Iteration[17] Time cost=21.357 +INFO:root:Iteration[17] Train-accuracy=0.957121 +INFO:root:Iteration[17] Time cost=21.431 +INFO:root:Iteration[17] Validation-accuracy=0.793908 +INFO:root:Iteration[17] Validation-accuracy=0.793216 +INFO:root:Iteration[17] Train-accuracy=0.962846 +INFO:root:Iteration[17] Time cost=21.819 +INFO:root:Iteration[17] Validation-accuracy=0.812994 +INFO:root:Iteration[18] Train-accuracy=0.961772 +INFO:root:Iteration[18] Time cost=20.599 +INFO:root:Iteration[18] Train-accuracy=0.963800 +INFO:root:Iteration[18] Time cost=20.569 +INFO:root:Iteration[18] Validation-accuracy=0.815467 +INFO:root:Iteration[18] Validation-accuracy=0.818829 +INFO:root:Iteration[18] Train-accuracy=0.966603 +INFO:root:Iteration[18] Time cost=21.018 +INFO:root:Iteration[18] Validation-accuracy=0.812698 +INFO:root:Iteration[19] Train-accuracy=0.975131 +INFO:root:Iteration[19] Time cost=20.671 +INFO:root:Iteration[19] Train-accuracy=0.975847 +INFO:root:Iteration[19] Time cost=20.758 +INFO:root:Iteration[19] Validation-accuracy=0.822785 +INFO:root:Iteration[19] Validation-accuracy=0.823378 +INFO:root:Iteration[19] Train-accuracy=0.981990 +INFO:root:Iteration[19] Time cost=20.912 +INFO:root:Accuracy = 0.823800 +INFO:root:Iteration[19] Validation-accuracy=0.828521 +INFO:root:Accuracy = 0.829200 +INFO:root:Accuracy = 0.833000 +``` + +## imagenet + +3 x dual 980, with cudnn, 1G ethernet + +`dist_sync`: + +``` +INFO:root:Start training with [gpu(0), gpu(1)] +INFO:root:Start training with [gpu(0), gpu(1)] +INFO:root:Start training with [gpu(0), gpu(1)] +INFO:root:Iter[0] Batch [5] Speed: 175.98 samples/sec +INFO:root:Iter[0] Batch [5] Speed: 173.52 samples/sec +INFO:root:Iter[0] Batch [5] Speed: 171.04 samples/sec +INFO:root:Iter[0] Batch [10] Speed: 107.82 samples/sec +INFO:root:Iter[0] Batch [10] Speed: 108.03 samples/sec +INFO:root:Iter[0] Batch [10] Speed: 107.79 samples/sec +INFO:root:Iter[0] Batch [15] Speed: 109.53 samples/sec +INFO:root:Iter[0] Batch [15] Speed: 109.74 samples/sec +INFO:root:Iter[0] Batch [15] Speed: 110.21 samples/sec +INFO:root:Iter[0] Batch [20] Speed: 113.19 samples/sec +INFO:root:Iter[0] Batch [20] Speed: 111.20 samples/sec +INFO:root:Iter[0] Batch [20] Speed: 110.38 samples/sec +INFO:root:Iter[0] Batch [25] Speed: 111.24 samples/sec +INFO:root:Iter[0] Batch [25] Speed: 109.90 samples/sec +INFO:root:Iter[0] Batch [25] Speed: 107.48 samples/sec +``` + +`dist_aync` + +``` +INFO:root:Iter[0] Batch [5] Speed: 202.15 samples/sec +INFO:root:Iter[0] Batch [5] Speed: 181.41 samples/sec +INFO:root:Iter[0] Batch [5] Speed: 179.61 samples/sec +INFO:root:Iter[0] Batch [10] Speed: 125.75 samples/sec +INFO:root:Iter[0] Batch [10] Speed: 108.90 samples/sec +INFO:root:Iter[0] Batch [10] Speed: 109.25 samples/sec +INFO:root:Iter[0] Batch [15] Speed: 118.44 samples/sec +INFO:root:Iter[0] Batch [15] Speed: 112.89 samples/sec +INFO:root:Iter[0] Batch [15] Speed: 112.83 samples/sec +INFO:root:Iter[0] Batch [20] Speed: 123.68 samples/sec +INFO:root:Iter[0] Batch [20] Speed: 115.85 
samples/sec +INFO:root:Iter[0] Batch [20] Speed: 105.82 samples/sec +INFO:root:Iter[0] Batch [25] Speed: 124.24 samples/sec +INFO:root:Iter[0] Batch [25] Speed: 115.21 samples/sec +INFO:root:Iter[0] Batch [25] Speed: 106.60 samples/sec +INFO:root:Iter[0] Batch [30] Speed: 120.62 samples/sec +INFO:root:Iter[0] Batch [30] Speed: 121.35 samples/sec +``` diff --git a/tests/python/multi-node/common.py b/tests/python/multi-node/common.py index 2d33a32c7145..0db092462a78 100644 --- a/tests/python/multi-node/common.py +++ b/tests/python/multi-node/common.py @@ -58,10 +58,11 @@ def cifar10(batch_size, input_shape, num_parts=1, part_index=0): rand_mirror = False, shuffle = False, round_batch = False, - data_shape = (3,28,28), + data_shape = input_shape, batch_size = batch_size) return (train, val) + def accuracy(model, data): """evaluate acc""" # predict diff --git a/tests/python/multi-node/dist_async_inception.py b/tests/python/multi-node/dist_async_inception.py new file mode 100644 index 000000000000..8cb116fc26e2 --- /dev/null +++ b/tests/python/multi-node/dist_async_inception.py @@ -0,0 +1,27 @@ +#!/usr/bin/env python +# pylint: skip-file +import mxnet as mx +import common +import logging + +mx.random.seed(0) +logging.basicConfig(level=logging.DEBUG) +kv = mx.kvstore.create('dist_async') +(train, val) = common.cifar10(num_parts = kv.num_workers, + part_index = kv.rank, + batch_size = 128, + input_shape=(3,28,28)) +devs = [mx.gpu(i) for i in range(2)] +model = mx.model.FeedForward.create( + ctx = devs, + kvstore = kv, + symbol = common.inception(), + X = train, + eval_data = val, + num_round = 20, + learning_rate = 0.05, + momentum = 0.9, + wd = 0.00001, + initializer = mx.init.Uniform(0.07)) + +common.accuracy(model, val) diff --git a/tests/python/multi-node/dist_async_lenet.py b/tests/python/multi-node/dist_async_lenet.py new file mode 100644 index 000000000000..866eed3b8f2a --- /dev/null +++ b/tests/python/multi-node/dist_async_lenet.py @@ -0,0 +1,27 @@ +#!/usr/bin/env python +import mxnet as mx +import logging +import common + +mx.random.seed(0) +logging.basicConfig(level=logging.DEBUG) + +kv = mx.kvstore.create('dist_async') + +# feed each machine the whole data +(train, val) = common.mnist(num_parts = kv.num_workers, + part_index = kv.rank, + batch_size = 100, + input_shape = (1,28,28)) + +model = mx.model.FeedForward.create( + ctx = mx.gpu(kv.rank), + kvstore = kv, + symbol = common.lenet(), + X = train, + num_round = 10, + learning_rate = 0.05, + momentum = 0.9, + wd = 0.00001) + +common.accuracy(model, val) diff --git a/tests/python/multi-node/dist_async_mlp.py b/tests/python/multi-node/dist_async_mlp.py new file mode 100644 index 000000000000..98abdca797ca --- /dev/null +++ b/tests/python/multi-node/dist_async_mlp.py @@ -0,0 +1,27 @@ +#!/usr/bin/env python +import mxnet as mx +import logging +import common + +mx.random.seed(0) +logging.basicConfig(level=logging.DEBUG) + +kv = mx.kvstore.create('dist_async') + +(train, val) = common.mnist(num_parts = kv.num_workers, + part_index = kv.rank, + batch_size = 100, + input_shape = (784,)) + +# train +model = mx.model.FeedForward.create( + symbol = common.mlp(), + ctx = mx.cpu(), + X = train, + num_round = 4, + learning_rate = 0.05, + wd = 0.0004, + momentum = 0.9, + kvstore = kv) + +common.accuracy(model, val) diff --git a/tests/python/multi-node/dist_imagenet_inception.py b/tests/python/multi-node/dist_imagenet_inception.py new file mode 100644 index 000000000000..978b821f8fa6 --- /dev/null +++ 
b/tests/python/multi-node/dist_imagenet_inception.py @@ -0,0 +1,30 @@ +#!/usr/bin/env python +import mxnet as mx +import logging +import imagenet + +logging.basicConfig(level=logging.DEBUG) + +kv = mx.kvstore.create('dist_sync') + +batch_size = 96 +(train, val) = imagenet.ilsvrc12(num_parts = kv.num_workers, + part_index = kv.rank, + batch_size = batch_size, + input_shape = (3, 224, 224)) + +# assume each worker has two gpus +devs = [mx.gpu(i) for i in range(2)] + +model = mx.model.FeedForward( + ctx = devs, + symbol = imagenet.inception(1000), + num_round = 20, + learning_rate = 0.05, + momentum = 0.9, + wd = 0.00001) + +model.fit(X = train, + eval_data = val, + kvstore = kv, + epoch_end_callback = mx.callback.Speedometer(batch_size, 5)) diff --git a/tests/python/multi-node/dist_sync_inception.py b/tests/python/multi-node/dist_sync_inception.py old mode 100755 new mode 100644 diff --git a/tests/python/multi-node/dist_sync_kvstore.py b/tests/python/multi-node/dist_sync_kvstore.py old mode 100755 new mode 100644 diff --git a/tests/python/multi-node/dist_sync_lenet.py b/tests/python/multi-node/dist_sync_lenet.py old mode 100755 new mode 100644 diff --git a/tests/python/multi-node/dist_sync_mlp.py b/tests/python/multi-node/dist_sync_mlp.py old mode 100755 new mode 100644 diff --git a/tests/python/multi-node/imagenet.py b/tests/python/multi-node/imagenet.py new file mode 100644 index 000000000000..7663df8d1bad --- /dev/null +++ b/tests/python/multi-node/imagenet.py @@ -0,0 +1,101 @@ +import sys +sys.path.insert(0, "../common/") +sys.path.insert(0, "../../python/") +import mxnet as mx +import get_data +import numpy as np +import logging + +def ilsvrc12(batch_size, input_shape, num_parts=1, part_index=0): + """return ilsvrc12 iterator + """ + data_dir = "../../../../ilsvrc12/" + train = mx.io.ImageRecordIter( + path_imgrec = data_dir + "train.rec", + mean_img = data_dir + "mean.bin", + data_shape = input_shape, + batch_size = batch_size, + rand_crop = True, + rand_mirror = True, + shuffle = True, + round_batch = True, + num_parts = num_parts, + part_index = part_index) + val = mx.io.ImageRecordIter( + path_imgrec = data_dir + "val.rec", + mean_img = data_dir + "mean.bin", + rand_crop = False, + rand_mirror = False, + shuffle = False, + round_batch = False, + data_shape = input_shape, + batch_size = batch_size) + return (train, val) + +def ConvFactory(data, num_filter, kernel, stride=(1,1), pad=(0, 0), name=None, suffix=''): + conv = mx.symbol.Convolution(data=data, num_filter=num_filter, kernel=kernel, stride=stride, pad=pad, name='conv_%s%s' %(name, suffix)) + bn = mx.symbol.BatchNorm(data=conv, name='bn_%s%s' %(name, suffix)) + act = mx.symbol.Activation(data=bn, act_type='relu', name='relu_%s%s' %(name, suffix)) + return act + +def InceptionFactoryA(data, num_1x1, num_3x3red, num_3x3, num_d3x3red, num_d3x3, pool, proj, name): + # 1x1 + c1x1 = ConvFactory(data=data, num_filter=num_1x1, kernel=(1, 1), name=('%s_1x1' % name)) + # 3x3 reduce + 3x3 + c3x3r = ConvFactory(data=data, num_filter=num_3x3red, kernel=(1, 1), name=('%s_3x3' % name), suffix='_reduce') + c3x3 = ConvFactory(data=c3x3r, num_filter=num_3x3, kernel=(3, 3), pad=(1, 1), name=('%s_3x3' % name)) + # double 3x3 reduce + double 3x3 + cd3x3r = ConvFactory(data=data, num_filter=num_d3x3red, kernel=(1, 1), name=('%s_double_3x3' % name), suffix='_reduce') + cd3x3 = ConvFactory(data=cd3x3r, num_filter=num_d3x3, kernel=(3, 3), pad=(1, 1), name=('%s_double_3x3_0' % name)) + cd3x3 = ConvFactory(data=cd3x3, num_filter=num_d3x3, kernel=(3, 3), 
pad=(1, 1), name=('%s_double_3x3_1' % name)) + # pool + proj + pooling = mx.symbol.Pooling(data=data, kernel=(3, 3), stride=(1, 1), pad=(1, 1), pool_type=pool, name=('%s_pool_%s_pool' % (pool, name))) + cproj = ConvFactory(data=pooling, num_filter=proj, kernel=(1, 1), name=('%s_proj' % name)) + # concat + concat = mx.symbol.Concat(*[c1x1, c3x3, cd3x3, cproj], name='ch_concat_%s_chconcat' % name) + return concat + +def InceptionFactoryB(data, num_3x3red, num_3x3, num_d3x3red, num_d3x3, name): + # 3x3 reduce + 3x3 + c3x3r = ConvFactory(data=data, num_filter=num_3x3red, kernel=(1, 1), name=('%s_3x3' % name), suffix='_reduce') + c3x3 = ConvFactory(data=c3x3r, num_filter=num_3x3, kernel=(3, 3), pad=(1, 1), stride=(2, 2), name=('%s_3x3' % name)) + # double 3x3 reduce + double 3x3 + cd3x3r = ConvFactory(data=data, num_filter=num_d3x3red, kernel=(1, 1), name=('%s_double_3x3' % name), suffix='_reduce') + cd3x3 = ConvFactory(data=cd3x3r, num_filter=num_d3x3, kernel=(3, 3), pad=(1, 1), stride=(1, 1), name=('%s_double_3x3_0' % name)) + cd3x3 = ConvFactory(data=cd3x3, num_filter=num_d3x3, kernel=(3, 3), pad=(1, 1), stride=(2, 2), name=('%s_double_3x3_1' % name)) + # pool + proj + pooling = mx.symbol.Pooling(data=data, kernel=(3, 3), stride=(2, 2), pool_type="max", name=('max_pool_%s_pool' % name)) + # concat + concat = mx.symbol.Concat(*[c3x3, cd3x3, pooling], name='ch_concat_%s_chconcat' % name) + return concat + +def inception(nhidden): + # data + data = mx.symbol.Variable(name="data") + # stage 1 + conv1 = ConvFactory(data=data, num_filter=64, kernel=(7, 7), stride=(2, 2), pad=(3, 3), name='conv1') + pool1 = mx.symbol.Pooling(data=conv1, kernel=(3, 3), stride=(2, 2), name='pool1', pool_type='max') + # stage 2 + conv2red = ConvFactory(data=pool1, num_filter=64, kernel=(1, 1), stride=(1, 1), name='conv2red') + conv2 = ConvFactory(data=conv2red, num_filter=192, kernel=(3, 3), stride=(1, 1), pad=(1, 1), name='conv2') + pool2 = mx.symbol.Pooling(data=conv2, kernel=(3, 3), stride=(2, 2), name='pool2', pool_type='max') + # stage 2 + in3a = InceptionFactoryA(pool2, 64, 64, 64, 64, 96, "avg", 32, '3a') + in3b = InceptionFactoryA(in3a, 64, 64, 96, 64, 96, "avg", 64, '3b') + in3c = InceptionFactoryB(in3b, 128, 160, 64, 96, '3c') + # stage 3 + in4a = InceptionFactoryA(in3c, 224, 64, 96, 96, 128, "avg", 128, '4a') + in4b = InceptionFactoryA(in4a, 192, 96, 128, 96, 128, "avg", 128, '4b') + in4c = InceptionFactoryA(in4b, 160, 128, 160, 128, 160, "avg", 128, '4c') + in4d = InceptionFactoryA(in4c, 96, 128, 192, 160, 192, "avg", 128, '4d') + in4e = InceptionFactoryB(in4d, 128, 192, 192, 256, '4e') + # stage 4 + in5a = InceptionFactoryA(in4e, 352, 192, 320, 160, 224, "avg", 128, '5a') + in5b = InceptionFactoryA(in5a, 352, 192, 320, 192, 224, "max", 128, '5b') + # global avg pooling + avg = mx.symbol.Pooling(data=in5b, kernel=(7, 7), stride=(1, 1), name="global_pool", pool_type='avg') + # linear classifier + flatten = mx.symbol.Flatten(data=avg, name='flatten') + fc1 = mx.symbol.FullyConnected(data=flatten, num_hidden=nhidden, name='fc1') + softmax = mx.symbol.Softmax(data=fc1, name='softmax') + return softmax diff --git a/tests/python/multi-node/local_inception.py b/tests/python/multi-node/local_inception.py old mode 100755 new mode 100644 index 5c5fad3c4da1..fcaa8dc79688 --- a/tests/python/multi-node/local_inception.py +++ b/tests/python/multi-node/local_inception.py @@ -15,8 +15,9 @@ def test_inception(devs, kv_type): ctx = devs, symbol = common.inception(), X = train, + eval_data = val, kvstore = kv_type, - 
num_round = 4, + num_round = 10, learning_rate = 0.1, momentum = 0.9, wd = 0.00001, diff --git a/tests/python/multi-node/local_lenet.py b/tests/python/multi-node/local_lenet.py old mode 100755 new mode 100644 diff --git a/tests/python/multi-node/local_mlp.py b/tests/python/multi-node/local_mlp.py old mode 100755 new mode 100644 diff --git a/tests/python/multi-node/test_data.py b/tests/python/multi-node/test_data.py old mode 100755 new mode 100644 diff --git a/tests/python/predict/mxnet_predict_example.py b/tests/python/predict/mxnet_predict_example.py new file mode 100644 index 000000000000..7eed3c72ceb8 --- /dev/null +++ b/tests/python/predict/mxnet_predict_example.py @@ -0,0 +1,63 @@ +import sys, os +curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) +sys.path.append("../../../predict/python/") +sys.path.append("../../../python/") + +from mxnet_predict import Predictor, load_ndarray_file +import mxnet as mx +import logging +import numpy as np +from skimage import io, transform + +# Load the pre-trained model +prefix = "Inception/Inception_BN" +num_round = 39 +symbol_file = "%s-symbol.json" % prefix +param_file = "%s-0039.params" % prefix +predictor = Predictor(open(symbol_file).read(), + open(param_file).read(), + {'data':(1, 3, 224, 224)}) +mean_img = load_ndarray_file(open("Inception/mean_224.nd").read())["mean_img"] + +synset = [l.strip() for l in open('Inception/synset.txt').readlines()] + +def PreprocessImage(path, show_img=False): + # load image + img = io.imread(path) + print("Original Image Shape: ", img.shape) + # we crop image from center + short_egde = min(img.shape[:2]) + yy = int((img.shape[0] - short_egde) / 2) + xx = int((img.shape[1] - short_egde) / 2) + crop_img = img[yy : yy + short_egde, xx : xx + short_egde] + # resize to 224, 224 + resized_img = transform.resize(crop_img, (224, 224)) + if show_img: + io.imshow(resized_img) + # convert to numpy.ndarray + sample = np.asarray(resized_img) * 256 + # swap channel from RGB to BGR + sample = sample[:, :, [2,1,0]] + # swap axes to make image from (224, 224, 4) to (3, 224, 224) + sample = np.swapaxes(sample, 0, 2) + sample = np.swapaxes(sample, 1, 2) + + # sub mean + normed_img = sample - mean_img + normed_img.resize(1, 3, 224, 224) + return normed_img + +# Get preprocessed batch (single image batch) +batch = PreprocessImage('./download.png', True) + +predictor.forward(data=batch) +prob = predictor.get_output(0)[0] + +pred = np.argsort(prob)[::-1] +# Get top1 label +top1 = synset[pred[0]] +print("Top1: ", top1) +# Get top5 label +top5 = [synset[pred[i]] for i in range(5)] +print("Top5: ", top5) + diff --git a/tests/python/train/test_conv.py b/tests/python/train/test_conv.py index df2b7b98afb2..bc068153c24e 100644 --- a/tests/python/train/test_conv.py +++ b/tests/python/train/test_conv.py @@ -26,9 +26,9 @@ fc2 = mx.symbol.FullyConnected(data = fl, name='fc2', num_hidden=10) softmax = mx.symbol.Softmax(data = fc2, name = 'sm') -num_round = 1 +num_epoch = 1 model = mx.model.FeedForward(softmax, mx.cpu(), - num_round=num_round, + num_epoch=num_epoch, learning_rate=0.1, wd=0.0001, momentum=0.9) # check data diff --git a/tests/python/train/test_mlp.py b/tests/python/train/test_mlp.py index 85266e12df52..5f1c27062066 100644 --- a/tests/python/train/test_mlp.py +++ b/tests/python/train/test_mlp.py @@ -20,7 +20,7 @@ def accuracy(label, pred): py = np.argmax(pred, axis=1) return np.sum(py == label) / float(label.size) -num_round = 4 +num_epoch = 4 prefix = './mlp' #check data @@ -46,9 +46,9 @@ def test_mlp(): 
X=train_dataiter, eval_data=val_dataiter, eval_metric=mx.metric.np(accuracy), - iter_end_callback=mx.callback.do_checkpoint(prefix), + epoch_end_callback=mx.callback.do_checkpoint(prefix), ctx=[mx.cpu(i) for i in range(2)], - num_round=num_round, + num_epoch=num_epoch, learning_rate=0.1, wd=0.0004, momentum=0.9) @@ -78,7 +78,7 @@ def test_mlp(): assert np.sum(np.abs(prob - prob2)) == 0 # load model from checkpoint - model3 = mx.model.FeedForward.load(prefix, num_round) + model3 = mx.model.FeedForward.load(prefix, num_epoch) prob3 = model3.predict(val_dataiter) assert np.sum(np.abs(prob - prob3)) == 0 @@ -88,7 +88,7 @@ def test_mlp(): prob4 = model4.predict(val_dataiter) assert np.sum(np.abs(prob - prob4)) == 0 - for i in range(num_round): + for i in range(num_epoch): os.remove('%s-%04d.params' % (prefix, i + 1)) os.remove('%s-symbol.json' % prefix) os.remove('%s-0128.params' % prefix) diff --git a/tests/python/unittest/test_kvstore.py b/tests/python/unittest/test_kvstore.py index 77439677320f..dd8149d4822e 100644 --- a/tests/python/unittest/test_kvstore.py +++ b/tests/python/unittest/test_kvstore.py @@ -27,6 +27,13 @@ def test_single_kv_pair(): kv.pull(3, out = val) check_diff_to_scalar(val, 1) +def test_init(): + """test init""" + kv = mx.kv.create() + kv.init(3, mx.nd.ones(shape)*4) + a = mx.nd.zeros(shape) + kv.pull(3, out=a) + check_diff_to_scalar(a, 4) def test_list_kv_pair(): """list key-value pair push & pull""" @@ -110,6 +117,7 @@ def test_get_type(): assert kv.type == kvtype if __name__ == '__main__': + test_init() test_get_type() test_single_kv_pair() test_list_kv_pair() diff --git a/tests/python/unittest/test_ndarray.py b/tests/python/unittest/test_ndarray.py index 36a1672bc636..c00350a3ad28 100644 --- a/tests/python/unittest/test_ndarray.py +++ b/tests/python/unittest/test_ndarray.py @@ -14,7 +14,7 @@ def same(a, b): return np.sum(a != b) == 0 -def check_with_uniform(uf, arg_shapes, dim=None): +def check_with_uniform(uf, arg_shapes, dim=None, npuf=None, rmin=-10): """check function consistency with uniform random numbers""" if isinstance(arg_shapes, int): assert dim @@ -23,14 +23,19 @@ def check_with_uniform(uf, arg_shapes, dim=None): ndarray_arg = [] numpy_arg = [] for s in arg_shapes: - npy = np.random.uniform(-10, 10, s) + npy = np.random.uniform(rmin, 10, s) narr = mx.nd.array(npy) ndarray_arg.append(narr) numpy_arg.append(npy) out1 = uf(*ndarray_arg) - out2 = uf(*numpy_arg) + if npuf is None: + out2 = uf(*numpy_arg) + else: + out2 = npuf(*numpy_arg) assert out1.shape == out2.shape - assert reldiff(out1.asnumpy(), out2) < 1e-6 + if isinstance(out1, mx.nd.NDArray): + out1 = out1.asnumpy() + assert reldiff(out1, out2) < 1e-6 def random_ndarray(dim): @@ -48,6 +53,9 @@ def test_ndarray_elementwise(): check_with_uniform(lambda x, y: x - y, 2, dim) check_with_uniform(lambda x, y: x * y, 2, dim) check_with_uniform(lambda x, y: x / y, 2, dim) + check_with_uniform(mx.nd.sqrt, 2, dim, np.sqrt, rmin=0) + check_with_uniform(mx.nd.square, 2, dim, np.square, rmin=0) + check_with_uniform(lambda x: mx.nd.norm(x).asscalar(), 1, dim, np.linalg.norm) def test_ndarray_negate(): npy = np.random.uniform(-10, 10, (2,3,4)) @@ -61,6 +69,30 @@ def test_ndarray_negate(): assert reldiff(npy, arr.asnumpy()) < 1e-6 +def test_ndarray_choose(): + shape = (100, 20) + npy = np.arange(np.prod(shape)).reshape(shape) + arr = mx.nd.array(npy) + nrepeat = 3 + for repeat in range(nrepeat): + indices = np.random.randint(shape[1], size=shape[0]) + assert same(npy[np.arange(shape[0]), indices], + 
mx.nd.choose_element(arr, mx.nd.array(indices)).asnumpy()) + + +def test_ndarray_choose(): + shape = (100, 20) + npy = np.arange(np.prod(shape)).reshape(shape) + arr = mx.nd.array(npy) + nrepeat = 3 + for repeat in range(nrepeat): + indices = np.random.randint(shape[1], size=shape[0]) + npy[:] = 0.0 + npy[np.arange(shape[0]), indices] = 1.0 + mx.nd.onehot_encode(mx.nd.array(indices), out=arr) + assert same(npy, arr.asnumpy()) + + def test_ndarray_copy(): c = mx.nd.array(np.random.uniform(-10, 10, (10, 10))) d = c.copyto(mx.Context('cpu', 0)) @@ -158,3 +190,4 @@ def test_dot(): test_ndarray_scalar() test_clip() test_dot() + test_ndarray_choose() diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index fbc007b9fed7..b0743a6f0bb6 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -171,8 +171,41 @@ def test_regression(): lambda x: x, lambda x, y : x - y) +def check_softmax_with_shape(shape, xpu): + X = mx.symbol.Variable('X') + L = mx.symbol.Variable('L') + Y = mx.symbol.Softmax(data=X, label=L) + x = mx.random.uniform(-1, 1, shape, ctx = xpu) + l = mx.nd.empty((shape[0],), ctx = xpu) + l[:] = np.random.randint(0, shape[0]-1, (shape[0],)) + grad = mx.nd.empty(shape, ctx = xpu) + + exec1 = Y.bind(xpu, args = [x, l], args_grad = {'X': grad}) + print('foward') + exec1.forward() + print(exec1.outputs[0].asnumpy()) + exec1.backward() + print(grad.asnumpy()) + +def check_multi_softmax_with_shape(shape, xpu): + X = mx.symbol.Variable('X') + L = mx.symbol.Variable('L') + Y = mx.symbol.Softmax(data=X, label=L, multi_output=True) + x = mx.random.uniform(-1, 1, shape, ctx = xpu) + l = mx.nd.empty((shape[0], shape[2]), ctx = xpu) + l[:] = np.random.randint(0, shape[1]-1, (shape[0], shape[2])) + grad = mx.nd.empty(shape, ctx = xpu) + + exec1 = Y.bind(xpu, args = [x, l], args_grad = {'X': grad}) + exec1.forward() + print(exec1.outputs[0].asnumpy()) + exec1.backward() + print(grad.asnumpy()) + if __name__ == '__main__': test_elementwise_sum() test_concat() test_slice_channel() test_regression() + #check_softmax_with_shape((3,4), mx.cpu()) + #check_multi_softmax_with_shape((3,4,5), mx.cpu()) diff --git a/tools/caffe_converter/convert_symbol.py b/tools/caffe_converter/convert_symbol.py index ea673c4a7863..9b5bcde99848 100644 --- a/tools/caffe_converter/convert_symbol.py +++ b/tools/caffe_converter/convert_symbol.py @@ -2,6 +2,7 @@ from caffe.proto import caffe_pb2 from google.protobuf import text_format import argparse +import sys def readProtoSolverFile(filepath): solver_config = caffe.proto.caffe_pb2.NetParameter() @@ -22,7 +23,12 @@ def proto2script(proto_file): top = dict() flatten_count = 0 symbol_string = "" - layer = proto.layer + if len(proto.layer): + layer = proto.layer + elif len(proto.layers): + layer = proto.layers + else: + raise Exception('Invalid proto file.') # We assume the first bottom blob of first layer is the output from data layer input_name = layer[0].bottom[0] @@ -33,7 +39,7 @@ def proto2script(proto_file): type_string = '' param_string = '' name = layer[i].name.replace('/', '_') - if layer[i].type == 'Convolution': + if layer[i].type == 'Convolution' or layer[i].type == 4: type_string = 'mx.symbol.Convolution' param = layer[i].convolution_param pad = 0 if len(param.pad) == 0 else param.pad[0] @@ -42,7 +48,7 @@ def proto2script(proto_file): (param.num_output, pad, pad, param.kernel_size[0],\ param.kernel_size[0], stride, stride, not param.bias_term) need_flatten[name] = True - if layer[i].type == 
'Pooling': + if layer[i].type == 'Pooling' or layer[i].type == 17: type_string = 'mx.symbol.Pooling' param = layer[i].pooling_param param_string = "pad=(%d,%d), kernel=(%d,%d), stride=(%d,%d)" %\ @@ -55,37 +61,37 @@ def proto2script(proto_file): else: raise Exception("Unknown Pooling Method!") need_flatten[name] = True - if layer[i].type == 'ReLU': + if layer[i].type == 'ReLU' or layer[i].type == 18: type_string = 'mx.symbol.Activation' param_string = "act_type='relu'" - need_flatten[name] = need_flatten[mapping[proto.layer[i].bottom[0]]] - if layer[i].type == 'LRN': + need_flatten[name] = need_flatten[mapping[layer[i].bottom[0]]] + if layer[i].type == 'LRN' or layer[i].type == 15: type_string = 'mx.symbol.LRN' param = layer[i].lrn_param param_string = "alpha=%f, beta=%f, knorm=%f, nsize=%d" %\ (param.alpha, param.beta, param.k, param.local_size) need_flatten[name] = True - if layer[i].type == 'InnerProduct': + if layer[i].type == 'InnerProduct' or layer[i].type == 14: type_string = 'mx.symbol.FullyConnected' param = layer[i].inner_product_param param_string = "num_hidden=%d, no_bias=%s" % (param.num_output, not param.bias_term) need_flatten[name] = False - if layer[i].type == 'Dropout': + if layer[i].type == 'Dropout' or layer[i].type == 6: type_string = 'mx.symbol.Dropout' param = layer[i].dropout_param param_string = "p=%f" % param.dropout_ratio - need_flatten[name] = need_flatten[mapping[proto.layer[i].bottom[0]]] - if layer[i].type == 'Softmax': + need_flatten[name] = need_flatten[mapping[layer[i].bottom[0]]] + if layer[i].type == 'Softmax' or layer[i].type == 20: type_string = 'mx.symbol.Softmax' # We only support single output network for now. output_name = name - if layer[i].type == 'Flatten': + if layer[i].type == 'Flatten' or layer[i].type == 8: type_string = 'mx.symbol.Flatten' need_flatten[name] = False - if layer[i].type == 'Split': + if layer[i].type == 'Split' or layer[i].type == 22: type_string = 'split' - if layer[i].type == 'Concat': + if layer[i].type == 'Concat' or layer[i].type == 3: type_string = 'mx.symbol.Concat' need_flatten[name] = True if type_string == '': @@ -121,3 +127,14 @@ def proto2symbol(proto_file): exec(sym) exec("ret = " + output_name) return ret + +def main(): + symbol_string, output_name = proto2script(sys.argv[1]) + if len(sys.argv) > 2: + with open(sys.argv[2], 'w') as fout: + fout.write(symbol_string) + else: + print(symbol_string) + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/tools/im2rec.cc b/tools/im2rec.cc index 61cee06bd23d..471287a15182 100644 --- a/tools/im2rec.cc +++ b/tools/im2rec.cc @@ -28,13 +28,17 @@ int main(int argc, char *argv[]) { "\tresize=newsize resize the shorter edge of image to the newsize, original images will be packed by default\n"\ "\tlabel_width=WIDTH[default=1] specify the label_width in the list, by default set to 1\n"\ "\tnsplit=NSPLIT[default=1] used for part generation, logically split the image.list to NSPLIT parts by position\n"\ - "\tpart=PART[default=0] used for part generation, pack the images from the specific part in image.list\n"); + "\tpart=PART[default=0] used for part generation, pack the images from the specific part in image.list\n" + "\tcenter_crop=CENTER_CROP[default=0] specify whether to crop the center image to make it rectangular.\n" + "\tquality=QUALITY[default=80] JPEG quality for encoding, 1-100.\n"); return 0; } int label_width = 1; int new_size = -1; int nsplit = 1; int partid = 0; + int center_crop = 0; + int quality = 80; for (int i = 4; i < argc; ++i) { char 
key[128], val[128]; if (sscanf(argv[i], "%[^=]=%s", key, val) == 2) { @@ -42,6 +46,8 @@ int main(int argc, char *argv[]) { if (!strcmp(key, "label_width")) label_width = atoi(val); if (!strcmp(key, "nsplit")) nsplit = atoi(val); if (!strcmp(key, "part")) partid = atoi(val); + if (!strcmp(key, "center_crop")) center_crop = atoi(val); + if (!strcmp(key, "quality")) quality = atoi(val); } } if (new_size > 0) { @@ -49,6 +55,9 @@ int main(int argc, char *argv[]) { } else { LOG(INFO) << "Keep origin image size"; } + if (center_crop) { + LOG(INFO) << "Center cropping to rectangular"; + } using namespace dmlc; const static size_t kBufferSize = 1 << 20UL; @@ -73,7 +82,8 @@ int main(int argc, char *argv[]) { std::vector encode_buf; std::vector encode_params; encode_params.push_back(CV_IMWRITE_JPEG_QUALITY); - encode_params.push_back(80); + encode_params.push_back(quality); + LOG(INFO) << "JPEG encoding quality: " << quality; dmlc::InputSplit::Blob line; while (flist->NextRecord(&line)) { @@ -111,6 +121,15 @@ int main(int argc, char *argv[]) { if (new_size > 0) { cv::Mat img = cv::imdecode(decode_buf, CV_LOAD_IMAGE_COLOR); CHECK(img.data != NULL) << "OpenCV decode fail:" << path; + if (center_crop) { + if (img.rows > img.cols) { + int margin = (img.rows - img.cols)/2; + img = img(cv::Range(margin, margin+img.cols), cv::Range(0, img.cols)); + } else { + int margin = (img.cols - img.rows)/2; + img = img(cv::Range(0, img.rows), cv::Range(margin, margin + img.rows)); + } + } cv::Mat res; if (img.rows > img.cols) { cv::resize(img, res, cv::Size(new_size, img.rows * new_size / img.cols), diff --git a/tools/make_list.py b/tools/make_list.py new file mode 100644 index 000000000000..926902807a54 --- /dev/null +++ b/tools/make_list.py @@ -0,0 +1,73 @@ +import os +import random +import numpy as np +import argparse + +def list_image(root, recursive, exts): + image_list = [] + if recursive: + cat = {} + for path, subdirs, files in os.walk(root): + print path + for fname in files: + fpath = os.path.join(path, fname) + suffix = os.path.splitext(fname)[1].lower() + if os.path.isfile(fpath) and (suffix in exts): + if path not in cat: + cat[path] = len(cat) + image_list.append((os.path.relpath(fpath, root), cat[path])) + else: + for fname in os.listdir(root): + fpath = os.path.join(root, fname) + suffix = os.path.splitext(fname)[1].lower() + if os.path.isfile(fpath) and (suffix in exts): + image_list.append((os.path.relpath(fpath, root), 0)) + return image_list + +def write_list(path_out, image_list): + with open(path_out, 'w') as fout: + for i in xrange(len(image_list)): + fout.write('%d \t %d \t %s\n'%(i, image_list[i][1], image_list[i][0])) + + +def make_list(prefix_out, root, recursive, exts, num_chunks, train_ratio): + image_list = list_image(root, recursive, exts) + random.shuffle(image_list) + N = len(image_list) + chunk_size = N/num_chunks + for i in xrange(num_chunks): + chunk = image_list[i*chunk_size:(i+1)*chunk_size] + if num_chunks > 1: + str_chunk = '_%d'%i + else: + str_chunk = '' + if train_ratio < 1: + sep = int(chunk_size*train_ratio) + write_list(prefix_out+str_chunk+'_train.lst', chunk[:sep]) + write_list(prefix_out+str_chunk+'_val.lst', chunk[sep:]) + else: + write_list(prefix_out+str_chunk+'.lst', chunk) + +def main(): + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + description='Make image list files that are\ + required by im2rec') + parser.add_argument('root', help='path to folder that contain images.') + parser.add_argument('prefix', help='prefix 
of output list files.') + parser.add_argument('--exts', type=list, default=['.jpeg','.jpg'], + help='list of acceptable image extensions.') + parser.add_argument('--chunks', type=int, default=1, help='number of chunks.') + parser.add_argument('--train_ratio', type=float, default=1.0, + help='Percent of images to use for training.') + parser.add_argument('--recursive', type=bool, default=False, + help='If true recursively walk through subdirs and assign an unique label\ + to images in each folder. Otherwise only include images in the root folder\ + and give them label 0.') + args = parser.parse_args() + + make_list(args.prefix, args.root, args.recursive, + args.exts, args.chunks, args.train_ratio) + +if __name__ == '__main__': + main() \ No newline at end of file
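
As a usage note for the new `tools/make_list.py` added above: the sketch below drives `make_list()` directly with the same argument order that `main()` uses after parsing. It is a minimal sketch, not part of the patch; the `./images` folder and the `data/mylist` prefix are hypothetical placeholders.

```python
# Hypothetical sketch: call make_list() from tools/make_list.py directly.
# Assumes this snippet runs with tools/ on the Python path; paths are placeholders.
from make_list import make_list

make_list('data/mylist',       # prefix of the generated .lst files
          './images',          # root folder; one sub-folder per class when recursive
          True,                # recursive: walk sub-dirs, one label per folder
          ['.jpeg', '.jpg'],   # accepted image extensions
          1,                   # num_chunks: write a single chunk
          0.9)                 # train_ratio: 90%/10% split ->
                               # data/mylist_train.lst and data/mylist_val.lst
```

Calling the function directly also sidesteps two argparse quirks in the script above: `--recursive` is declared with `type=bool`, so any non-empty string (including `False`) parses as `True`, and `--exts` is declared with `type=list`, which splits a string into individual characters. The resulting `.lst` file is what `tools/im2rec` packs into a `.rec`; the new `center_crop=1` and `quality=<1-100>` key=value options added to `im2rec.cc` in this patch apply at that packing step.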