diff --git a/.gitignore b/.gitignore index 749197668afc..3bbcd2ce1ba9 100644 --- a/.gitignore +++ b/.gitignore @@ -115,3 +115,12 @@ scala-package/*/*/target/ .project .cproject .pydevproject +CMakeFiles +cmake_install.cmake +dmlc-core +ps-lite +nnvm +lib + +# Visual Studio Code +.vscode diff --git a/CMakeLists.txt b/CMakeLists.txt index b81b1910c015..505d3c260130 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -37,8 +37,15 @@ else(MSVC) include(CheckCXXCompilerFlag) check_cxx_compiler_flag("-std=c++11" SUPPORT_CXX11) check_cxx_compiler_flag("-msse2" SUPPORT_MSSE2) - set(CMAKE_C_FLAGS "-O3 -Wall -msse2 -Wno-unknown-pragmas -fPIC") - set(CMAKE_CXX_FLAGS "${CMAKE_C_FLAGS}") + set(CMAKE_C_FLAGS "-Wall -msse2 -Wno-unknown-pragmas -fPIC") + if(NDEBUG) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O3") + else(NDEBUG) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O0 -ggdb3") + endif(NDEBUG) + if(SUPPORT_CXX11) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") + endif() endif(MSVC) if(USE_OPENCV) @@ -72,7 +79,7 @@ if(USE_CUDNN) add_definitions(-DUSE_CUDNN) include_directories(SYSTEM ${CUDNN_INCLUDE}) list(APPEND mxnet_LINKER_LIBS ${CUDNN_LIBRARY}) - add_definitions(-DMSHADOW_USE_CUDNN=1) + add_definitions(-DMSHADOW_USE_CUDNN=1) endif() endif() @@ -135,6 +142,37 @@ if(USE_PLUGINS_WARPCTC) list(APPEND CUDA ${PLUGINS_CUSRC}) endif() +if(USE_PLUGIN_CAFFE) + if(NOT DEFINED CAFFE_PATH) + if(EXISTS ${PROJECT_SOURCE_DIR}/caffe) + # Need newer FindCUDA.cmake that correctly handles -std=c++11 + cmake_minimum_required(VERSION 3.3) + set(CAFFE_PATH ${PROJECT_SOURCE_DIR}/caffe) + endif() + endif() + list(APPEND CMAKE_MODULE_PATH ${CAFFE_PATH}/cmake) + include_directories(${CAFFE_PATH}/include) + include_directories(${CAFFE_PATH}/build/src) + include_directories(${CMAKE_BINARY_DIR}/caffe/include) + link_directories(${CAFFE_PATH}/build/lib) + if(NOT DEFINED CAFFE_PATH) + message(FATAL_ERROR "Please set CAFFE_PATH to point to the caffe source installation") + endif() + 
mxnet_source_group("Include\\plugin\\caffe" GLOB "plugin/caffe/*.h") + mxnet_source_group("Source\\plugin\\caffe" GLOB "plugin/caffe/*.cc") + mxnet_source_group("Cuda\\plugin\\caffe" GLOB "plugin/caffe/*.cu") + FILE(GLOB_RECURSE PLUGINS_SOURCE "plugin/caffe/*.cc" "plugin/caffe/*.h") + FILE(GLOB_RECURSE PLUGINS_CUSRC "plugin/caffe/*.cu") + list(APPEND SOURCE ${PLUGINS_SOURCE}) + list(APPEND CUDA ${PLUGINS_CUSRC}) + include_directories(${CMAKE_BINARY_DIR}/include) + list(APPEND mxnet_LINKER_LIBS + protobuf boost_system boost_thread boost_filesystem + gflags glog caffe + ${Caffe_LINKER_LIBS} +) +endif() + if (NOT (EXTRA_OPERATORS STREQUAL "")) mxnet_source_group("Extra" GLOB_RECURSE "${EXTRA_OPERATORS}/*.cc") mxnet_source_group("Extra\\Cuda" GLOB_RECURSE "${EXTRA_OPERATORS}/*.cu") @@ -163,15 +201,23 @@ if(USE_CUDA) list(APPEND mxnet_LINKER_LIBS ${CUDA_cuda_LIBRARY}) else(MSVC) list(APPEND mxnet_LINKER_LIBS nvrtc cuda) + link_directories("${CUDA_TOOLKIT_ROOT_DIR}/lib64") endif() list(APPEND SOURCE ${cuda_objs} ${CUDA}) endif() +# unsupported: if caffe is a subdirectory of mxnet, load its CMakeLists.txt as well +if(USE_PLUGIN_CAFFE) + if(EXISTS ${PROJECT_SOURCE_DIR}/caffe) + add_subdirectory(caffe) + endif() +endif() + if(NOT MSVC) # Only add c++11 flags and definitions after cuda compiling add_definitions(-DDMLC_USE_CXX11) add_definitions(-DMSHADOW_IN_CXX11) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -std=c++0x") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++0x") else() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /EHsc") diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index ee6fbcf057d3..eddbdd58ed1c 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -28,6 +28,7 @@ The committers are the granted write access to the project. - Chuntao is the initiator and provides the initial design of engine. * [Chiyuan Zhang](https://github.com/pluskid) - Chiyuan is the creator of MXNet Julia Package. 
+* [Junyuan Xie](https://github.com/piiswrong) * [Qiang Kou](https://github.com/thirdwing) - KK is a R ninja, he makes mxnet available for R users. * [Tong He](https://github.com/hetong007) @@ -64,7 +65,6 @@ List of Contributors * [Shuzhe Wu](https://github.com/II-Matto) * [Xiaodong](https://github.com/XD-DENG) * [Nan Xiao](https://github.com/road2stat) -* [Junyuan Xie](https://github.com/piiswrong) * [Wei Wu](https://github.com/tornadomeet) * [Michaël Benesty](https://github.com/pommedeterresautee) -Michaël contributes the R visualization module of mxnet @@ -110,3 +110,4 @@ List of Contributors * [Yang Bo](https://github.com/Atry) * [Jonas Amaro](https://github.com/jonasrla) * [Yan Li](https://github.com/Godricly) +* [Yuance Li](https://github.com/liyuance) diff --git a/Makefile b/Makefile index ccea1ba0a7b0..2ccd668814eb 100644 --- a/Makefile +++ b/Makefile @@ -100,10 +100,10 @@ CUOBJ = $(patsubst %.cu, build/%_gpu.o, $(CUSRC)) # extra operators ifneq ($(EXTRA_OPERATORS),) - EXTRA_SRC = $(wildcard $(EXTRA_OPERATORS)/*.cc $(EXTRA_OPERATORS)/*/*.cc) - EXTRA_OBJ = $(patsubst $(EXTRA_OPERATORS)/%.cc, $(EXTRA_OPERATORS)/build/%.o, $(EXTRA_SRC)) - EXTRA_CUSRC = $(wildcard $(EXTRA_OPERATORS)/*.cu $(EXTRA_OPERATORS)/*/*.cu) - EXTRA_CUOBJ = $(patsubst $(EXTRA_OPERATORS)/%.cu, $(EXTRA_OPERATORS)/build/%_gpu.o, $(EXTRA_CUSRC)) + EXTRA_SRC = $(wildcard $(patsubst %, %/*.cc %/*/*.cc, $(EXTRA_OPERATORS))) + EXTRA_OBJ = $(patsubst %.cc, %.o, $(EXTRA_SRC)) + EXTRA_CUSRC = $(wildcard $(patsubst %, %/*.cu %/*/*.cu, $(EXTRA_OPERATORS))) + EXTRA_CUOBJ = $(patsubst %.cu, %_gpu.o, $(EXTRA_CUSRC)) else EXTRA_SRC = EXTRA_OBJ = @@ -157,11 +157,6 @@ build/src/%_gpu.o: src/%.cu $(NVCC) $(NVCCFLAGS) -Xcompiler "$(CFLAGS)" -M -MT build/src/$*_gpu.o $< >build/src/$*_gpu.d $(NVCC) -c -o $@ $(NVCCFLAGS) -Xcompiler "$(CFLAGS)" $< -build/plugin/%.o: plugin/%.cc - @mkdir -p $(@D) - $(CXX) -std=c++11 $(CFLAGS) -MM -MT build/plugin/$*.o $< >build/plugin/$*.d - $(CXX) -std=c++11 -c $(CFLAGS) -c $< -o 
$@ - # A nvcc bug cause it to generate "generic/xxx.h" dependencies from torch headers. # Use CXX to generate dependency instead. build/plugin/%_gpu.o: plugin/%.cu @@ -169,16 +164,21 @@ build/plugin/%_gpu.o: plugin/%.cu $(CXX) -std=c++11 $(CFLAGS) -MM -MT build/plugin/$*_gpu.o $< >build/plugin/$*_gpu.d $(NVCC) -c -o $@ $(NVCCFLAGS) -Xcompiler "$(CFLAGS)" $< -$(EXTRA_OPERATORS)/build/%.o: $(EXTRA_OPERATORS)/%.cc +build/plugin/%.o: plugin/%.cc @mkdir -p $(@D) - $(CXX) -std=c++11 $(CFLAGS) -Isrc/operator -MM -MT $(EXTRA_OPERATORS)/build/$*.o $< >$(EXTRA_OPERATORS)/build/$*.d - $(CXX) -std=c++11 -c $(CFLAGS) -Isrc/operator -c $< -o $@ + $(CXX) -std=c++11 $(CFLAGS) -MM -MT build/plugin/$*.o $< >build/plugin/$*.d + $(CXX) -std=c++11 -c $(CFLAGS) -c $< -o $@ -$(EXTRA_OPERATORS)/build/%_gpu.o: $(EXTRA_OPERATORS)/%.cu +%_gpu.o: %.cu @mkdir -p $(@D) - $(NVCC) $(NVCCFLAGS) -Xcompiler "$(CFLAGS) -Isrc/operator" -M -MT $(EXTRA_OPERATORS)/build/$*_gpu.o $< >$(EXTRA_OPERATORS)/build/$*_gpu.d + $(NVCC) $(NVCCFLAGS) -Xcompiler "$(CFLAGS) -Isrc/operator" -M -MT $*_gpu.o $< >$*_gpu.d $(NVCC) -c -o $@ $(NVCCFLAGS) -Xcompiler "$(CFLAGS) -Isrc/operator" $< +%.o: %.cc + @mkdir -p $(@D) + $(CXX) -std=c++11 $(CFLAGS) -Isrc/operator -MM -MT $*.o $< >$*.d + $(CXX) -std=c++11 -c $(CFLAGS) -Isrc/operator -c $< -o $@ + # NOTE: to statically link libmxnet.a we need the option # --Wl,--whole-archive -lmxnet --Wl,--no-whole-archive lib/libmxnet.a: $(ALL_DEP) @@ -189,12 +189,15 @@ lib/libmxnet.so: $(ALL_DEP) @mkdir -p $(@D) $(CXX) $(CFLAGS) -shared -o $@ $(filter %.o %.a, $^) $(LDFLAGS) -$(PS_PATH)/build/libps.a: +$(PS_PATH)/build/libps.a: PSLITE + +PSLITE: $(MAKE) CXX=$(CXX) DEPS_PATH=$(DEPS_PATH) -C $(PS_PATH) ps - ln -fs $(PS_PATH)/tracker . 
-$(DMLC_CORE)/libdmlc.a: - + cd $(DMLC_CORE); make libdmlc.a config=$(ROOTDIR)/$(config); cd $(ROOTDIR) +$(DMLC_CORE)/libdmlc.a: DMLCCORE + +DMLCCORE: + + cd $(DMLC_CORE); make libdmlc.a USE_SSE=$(USE_SSE) config=$(ROOTDIR)/$(config); cd $(ROOTDIR) bin/im2rec: tools/im2rec.cc $(ALL_DEP) @@ -265,7 +268,7 @@ clean: $(RM) -r build lib bin *~ */*~ */*/*~ */*/*/*~ cd $(DMLC_CORE); make clean; cd - cd $(PS_PATH); make clean; cd - - $(RM) -r $(EXTRA_OPERATORS)/build + $(RM) -r $(patsubst %, %/*.d %/*/*.d %/*.o %/*/*.o, $(EXTRA_OPERATORS)) else clean: $(RM) -r build lib bin *~ */*~ */*/*~ */*/*/*~ @@ -279,5 +282,5 @@ clean_all: clean -include build/*/*.d -include build/*/*/*.d ifneq ($(EXTRA_OPERATORS),) - -include $(EXTRA_OPERATORS)/build/*.d + -include $(patsubst %, %/*.d %/*/*.d, $(EXTRA_OPERATORS)) endif diff --git a/R-package/R/context.R b/R-package/R/context.R index 5a1b18c30505..fdcb48a857d6 100644 --- a/R-package/R/context.R +++ b/R-package/R/context.R @@ -1,6 +1,6 @@ # Initialize the global context init.context.default <- function() { - .GlobalEnv$mx.ctx.internal.default.value <- mx.cpu() + assign("mx.ctx.internal.default.value", mx.cpu(), envir = .MXNetEnv) } #' Set/Get default context for array creation. @@ -11,9 +11,9 @@ init.context.default <- function() { #' @export mx.ctx.default <- function(new = NULL) { if (!is.null(new)) { - mx.ctx.internal.default.value <<- new + assign("mx.ctx.internal.default.value", new, envir = .MXNetEnv) } - return (mx.ctx.internal.default.value) + return (.MXNetEnv$mx.ctx.internal.default.value) } #' Check if the type is mxnet context. diff --git a/R-package/R/model.R b/R-package/R/model.R index 39eb4c7479b4..102c9ae7a4e4 100644 --- a/R-package/R/model.R +++ b/R-package/R/model.R @@ -352,6 +352,8 @@ mx.model.select.layout.predict <- function(X, model) { #' This is only used when X is R array. #' @param ctx mx.context or list of mx.context, optional #' The devices used to perform training. 
+#' @param begin.round integer (default=1) +#' The initial iteration over the training data to train the model. #' @param num.round integer (default=10) #' The number of iterations over training data to train the model. #' @param optimizer string, default="sgd" @@ -387,7 +389,7 @@ mx.model.select.layout.predict <- function(X, model) { #' @export mx.model.FeedForward.create <- -function(symbol, X, y=NULL, ctx=NULL, +function(symbol, X, y=NULL, ctx=NULL, begin.round=1, num.round=10, optimizer="sgd", initializer=mx.init.uniform(0.01), eval.data=NULL, eval.metric=NULL, @@ -444,7 +446,7 @@ function(symbol, X, y=NULL, ctx=NULL, kvstore <- mx.model.create.kvstore(kvstore, params$arg.params, length(ctx), verbose=verbose) model <- mx.model.train(symbol, ctx, input.shape, params$arg.params, params$aux.params, - 1, num.round, optimizer=optimizer, + begin.round, num.round, optimizer=optimizer, train.data=X, eval.data=eval.data, metric=eval.metric, epoch.end.callback=epoch.end.callback, diff --git a/R-package/R/mxnet_generated.R b/R-package/R/mxnet_generated.R index d8e32ad58ea9..752237fc14b3 100644 --- a/R-package/R/mxnet_generated.R +++ b/R-package/R/mxnet_generated.R @@ -22,10 +22,26 @@ NULL #' @name mx.nd.argmax.channel NULL +#' Calculate batched dot product of two matrices. (batch, M, K) batch_dot (batch, K, N) --> (batch, M, N) +#' +#' @param lhs NDArray +#' Left operand to the function +#' @param rhs NDArray +#' Right operand to the function +#' @return out The result mx.ndarray +#' +#' @export +#' @name mx.nd.batch.dot +NULL + #' Broadcast data in the given axis to the given size. The original size of the broadcasting axis must be 1. #' #' @param src NDArray #' Source input to the function +#' @param axis Shape(tuple), optional, default=() +#' The axes to perform the broadcasting. +#' @param size Shape(tuple), optional, default=() +#' Target sizes of the broadcasting axes. 
#' @return out The result mx.ndarray #' #' @export @@ -80,6 +96,30 @@ NULL #' @name mx.nd.broadcast.plus NULL +#' lhs power rhs with broadcast +#' +#' @param lhs NDArray +#' Left operand to the function +#' @param rhs NDArray +#' Right operand to the function +#' @return out The result mx.ndarray +#' +#' @export +#' @name mx.nd.broadcast.power +NULL + +#' Broadcast data to the target shape. The original size of the broadcasting axis must be 1. +#' +#' @param src NDArray +#' Source input to the function +#' @param shape Shape(tuple), optional, default=() +#' The shape of the desired array. We can set the dim to zero if it's same as the original. E.g `A = broadcast_to(B, shape=(10, 0, 0))` has the same meaning as `A = broadcast_axis(B, axis=0, size=10)`. +#' @return out The result mx.ndarray +#' +#' @export +#' @name mx.nd.broadcast.to +NULL + #' Take ceil value of the src #' #' @param src NDArray @@ -130,6 +170,10 @@ NULL #' #' @param src NDArray #' Source input to the function +#' @param begin Shape(tuple), required +#' starting coordinates +#' @param end Shape(tuple), required +#' ending coordinates #' @return out The result mx.ndarray #' #' @export @@ -162,6 +206,8 @@ NULL #' #' @param src NDArray #' Source input to the function +#' @param axis int (non-negative), required +#' Position (amongst axes) where new axis is to be inserted. #' @return out The result mx.ndarray #' #' @export @@ -186,6 +232,8 @@ NULL #' #' @param src NDArray #' Source input to the function +#' @param axis int, required +#' The dimension to flip #' @return out The result mx.ndarray #' #' @export @@ -212,40 +260,56 @@ NULL #' @name mx.nd.log NULL -#' Take max of the src in the given axis. Params: `axis` and `keepdims`. axis: tuple or integer of axes to reduce, global reduce will be performed if not set. keepdims: the same meaning as Numpy. +#' Take max of the src in the given axis and returns a NDArray. Follows numpy semantics. 
#' #' @param src NDArray #' Source input to the function +#' @param axis Shape(tuple), optional, default=() +#' Same as Numpy. The axes to perform the reduction.If left empty, a global reduction will be performed. +#' @param keepdims boolean, optional, default=False +#' Same as Numpy. If keepdims is set to true, the axis which is reduced is left in the result as dimension with size one. #' @return out The result mx.ndarray #' #' @export #' @name mx.nd.max NULL -#' (Depreciated! Use max instead!) Take max of the src in the given axis. Params: `axis` and `keepdims`. axis: tuple or integer of axes to reduce, global reduce will be performed if not set. keepdims: the same meaning as Numpy. +#' (Depreciated! Use max instead!) Take max of the src in the given axis and returns a NDArray. Follows numpy semantics. #' #' @param src NDArray #' Source input to the function +#' @param axis Shape(tuple), optional, default=() +#' Same as Numpy. The axes to perform the reduction.If left empty, a global reduction will be performed. +#' @param keepdims boolean, optional, default=False +#' Same as Numpy. If keepdims is set to true, the axis which is reduced is left in the result as dimension with size one. #' @return out The result mx.ndarray #' #' @export #' @name mx.nd.max.axis NULL -#' Take min of the src in the given axis. Params: `axis` and `keepdims`. axis: tuple or integer of axes to reduce, global reduce will be performed if not set. keepdims: the same meaning as Numpy. +#' Take min of the src in the given axis and returns a NDArray. Follows numpy semantics. #' #' @param src NDArray #' Source input to the function +#' @param axis Shape(tuple), optional, default=() +#' Same as Numpy. The axes to perform the reduction.If left empty, a global reduction will be performed. +#' @param keepdims boolean, optional, default=False +#' Same as Numpy. If keepdims is set to true, the axis which is reduced is left in the result as dimension with size one. 
#' @return out The result mx.ndarray #' #' @export #' @name mx.nd.min NULL -#' (Depreciated! Use min instead!) Take min of the src in the given axis. Params: `axis` and `keepdims`. axis: tuple or integer of axes to reduce, global reduce will be performed if not set. keepdims: the same meaning as Numpy. +#' (Depreciated! Use min instead!) Take min of the src in the given axis and returns a NDArray. Follows numpy semantics. #' #' @param src NDArray #' Source input to the function +#' @param axis Shape(tuple), optional, default=() +#' Same as Numpy. The axes to perform the reduction.If left empty, a global reduction will be performed. +#' @param keepdims boolean, optional, default=False +#' Same as Numpy. If keepdims is set to true, the axis which is reduced is left in the result as dimension with size one. #' @return out The result mx.ndarray #' #' @export @@ -306,6 +370,12 @@ NULL #' #' @param src NDArray #' Source input to the function +#' @param axis int, required +#' The axis to be sliced +#' @param begin int, required +#' The beginning index to be sliced +#' @param end int, required +#' The end index to be sliced #' @return out The result mx.ndarray #' #' @export @@ -354,20 +424,28 @@ NULL #' @name mx.nd.square NULL -#' Take sum of the src in the given axis. Params: `axis` and `keepdims`. axis: tuple or integer of axes to reduce, global reduce will be performed if not set. keepdims: the same meaning as Numpy. +#' Take sum of the src in the given axis and returns a NDArray. Follows numpy semantics. #' #' @param src NDArray #' Source input to the function +#' @param axis Shape(tuple), optional, default=() +#' Same as Numpy. The axes to perform the reduction.If left empty, a global reduction will be performed. +#' @param keepdims boolean, optional, default=False +#' Same as Numpy. If keepdims is set to true, the axis which is reduced is left in the result as dimension with size one. 
#' @return out The result mx.ndarray #' #' @export #' @name mx.nd.sum NULL -#' (Depreciated! Use sum instead!) Take sum of the src in the given axis. Params: `axis` and `keepdims`. axis: tuple or integer of axes to reduce, global reduce will be performed if not set. keepdims: the same meaning as Numpy. +#' (Depreciated! Use sum instead!) Take sum of the src in the given axis and returns a NDArray. Follows numpy semantics. #' #' @param src NDArray #' Source input to the function +#' @param axis Shape(tuple), optional, default=() +#' Same as Numpy. The axes to perform the reduction.If left empty, a global reduction will be performed. +#' @param keepdims boolean, optional, default=False +#' Same as Numpy. If keepdims is set to true, the axis which is reduced is left in the result as dimension with size one. #' @return out The result mx.ndarray #' #' @export @@ -378,6 +456,8 @@ NULL #' #' @param src NDArray #' Source input to the function +#' @param axes Shape(tuple), optional, default=() +#' Target axis order. By default the axes will be inverted. #' @return out The result mx.ndarray #' #' @export @@ -615,6 +695,8 @@ mx.symbol.Cast <- function(...) { #' Tmp workspace for convolution (MB). #' @param no.bias boolean, optional, default=False #' Whether to disable bias parameter. +#' @param cudnn.tune {'fastest', 'limited_workspace', 'off'},optional, default='limited_workspace' +#' Whether to find convolution algo by running performance test.Leads to higher startup time but may give better speed #' @param name string, optional #' Name of the resulting symbol. #' @return out The result mx.symbol @@ -1058,6 +1140,27 @@ mx.symbol.SoftmaxOutput <- function(...) { mx.varg.symbol.SoftmaxOutput(list(...)) } +#' Apply spatial transformer to input feature map. +#' +#' @param data Symbol +#' Input data to the SpatialTransformerOp. 
+#' @param loc Symbol +#' localisation net, the output dim should be 6 when transform_type is affine, and the name of the loc symbol should preferably start with 'stn_loc', so that it is initialized with the identity transform; otherwise you should initialize the weight and bias by yourself. +#' @param target.shape Shape(tuple), optional, default=(0,0) +#' output shape(h, w) of spatial transformer: (y, x) +#' @param transform.type {'affine'}, required +#' transformation type +#' @param sampler.type {'bilinear'}, required +#' sampling type +#' @param name string, optional +#' Name of the resulting symbol. +#' @return out The result mx.symbol +#' +#' @export +mx.symbol.SpatialTransformer <- function(...) { + mx.varg.symbol.SpatialTransformer(list(...)) +} + #' Apply swapaxis to input. #' #' @param data Symbol @@ -1113,10 +1216,29 @@ mx.symbol.abs <- function(...) { mx.varg.symbol.abs(list(...)) } +#' Calculate batched dot product of two matrices. (batch, M, K) batch_dot (batch, K, N) --> (batch, M, N) +#' +#' @param lhs Symbol +#' Left symbolic input to the function +#' @param rhs Symbol +#' Right symbolic input to the function +#' @param name string, optional +#' Name of the resulting symbol. +#' @return out The result mx.symbol +#' +#' @export +mx.symbol.batch_dot <- function(...) { + mx.varg.symbol.batch_dot(list(...)) +} + #' Broadcast data in the given axis to the given size. The original size of the broadcasting axis must be 1. #' #' @param src Symbol #' Left symbolic input to the function +#' @param axis Shape(tuple), optional, default=() +#' The axes to perform the broadcasting. +#' @param size Shape(tuple), optional, default=() +#' Target sizes of the broadcasting axes. #' @param name string, optional #' Name of the resulting symbol. #' @return out The result mx.symbol @@ -1186,6 +1308,36 @@ mx.symbol.broadcast_plus <- function(...)
{ mx.varg.symbol.broadcast_plus(list(...)) } +#' lhs power rhs with broadcast +#' +#' @param lhs Symbol +#' Left symbolic input to the function +#' @param rhs Symbol +#' Right symbolic input to the function +#' @param name string, optional +#' Name of the resulting symbol. +#' @return out The result mx.symbol +#' +#' @export +mx.symbol.broadcast_power <- function(...) { + mx.varg.symbol.broadcast_power(list(...)) +} + +#' Broadcast data to the target shape. The original size of the broadcasting axis must be 1. +#' +#' @param src Symbol +#' Left symbolic input to the function +#' @param shape Shape(tuple), optional, default=() +#' The shape of the desired array. We can set the dim to zero if it's same as the original. E.g `A = broadcast_to(B, shape=(10, 0, 0))` has the same meaning as `A = broadcast_axis(B, axis=0, size=10)`. +#' @param name string, optional +#' Name of the resulting symbol. +#' @return out The result mx.symbol +#' +#' @export +mx.symbol.broadcast_to <- function(...) { + mx.varg.symbol.broadcast_to(list(...)) +} + #' Take ceil value of the src #' #' @param src Symbol @@ -1244,6 +1396,8 @@ mx.symbol.exp <- function(...) { #' #' @param src Symbol #' Left symbolic input to the function +#' @param axis int (non-negative), required +#' Position (amongst axes) where new axis is to be inserted. #' @param name string, optional #' Name of the resulting symbol. #' @return out The result mx.symbol @@ -1352,6 +1506,12 @@ mx.symbol.sin <- function(...) { #' #' @param src Symbol #' Left symbolic input to the function +#' @param axis int, required +#' The axis to be sliced +#' @param begin int, required +#' The beginning index to be sliced +#' @param end int, required +#' The end index to be sliced #' @param name string, optional #' Name of the resulting symbol. #' @return out The result mx.symbol @@ -1415,10 +1575,14 @@ mx.symbol.square <- function(...) { mx.varg.symbol.square(list(...)) } -#' Take sum of the src in the given axis. Params: `axis` and `keepdims`. 
axis: tuple or integer of axes to reduce, global reduce will be performed if not set. keepdims: the same meaning as Numpy. +#' Take sum of the src in the given axis and returns a NDArray. Follows numpy semantics. #' #' @param src Symbol #' Left symbolic input to the function +#' @param axis Shape(tuple), optional, default=() +#' Same as Numpy. The axes to perform the reduction.If left empty, a global reduction will be performed. +#' @param keepdims boolean, optional, default=False +#' Same as Numpy. If keepdims is set to true, the axis which is reduced is left in the result as dimension with size one. #' @param name string, optional #' Name of the resulting symbol. #' @return out The result mx.symbol @@ -1428,10 +1592,14 @@ mx.symbol.sum <- function(...) { mx.varg.symbol.sum(list(...)) } -#' (Depreciated! Use sum instead!) Take sum of the src in the given axis. Params: `axis` and `keepdims`. axis: tuple or integer of axes to reduce, global reduce will be performed if not set. keepdims: the same meaning as Numpy. +#' (Depreciated! Use sum instead!) Take sum of the src in the given axis and returns a NDArray. Follows numpy semantics. #' #' @param src Symbol #' Left symbolic input to the function +#' @param axis Shape(tuple), optional, default=() +#' Same as Numpy. The axes to perform the reduction.If left empty, a global reduction will be performed. +#' @param keepdims boolean, optional, default=False +#' Same as Numpy. If keepdims is set to true, the axis which is reduced is left in the result as dimension with size one. #' @param name string, optional #' Name of the resulting symbol. #' @return out The result mx.symbol @@ -1445,6 +1613,8 @@ mx.symbol.sum_axis <- function(...) { #' #' @param src Symbol #' Left symbolic input to the function +#' @param axes Shape(tuple), optional, default=() +#' Target axis order. By default the axes will be inverted. #' @param name string, optional #' Name of the resulting symbol. 
#' @return out The result mx.symbol diff --git a/R-package/R/optimizer.R b/R-package/R/optimizer.R index 36543931d1f6..820e382cb9ed 100644 --- a/R-package/R/optimizer.R +++ b/R-package/R/optimizer.R @@ -149,6 +149,246 @@ mx.opt.rmsprop <- function(learning.rate=0.002, return(list(create.state=create.state, update=update)) } +#' Create an Adam optimizer with respective parameters. +#' Adam optimizer as described in [King2014]. +#' +#' [King2014] Diederik Kingma, Jimmy Ba, +#' Adam: A Method for Stochastic Optimization, +#' http://arxiv.org/abs/1412.6980 +#' +#' @param learning.rate float, default=0.001 +#' Step size. +#' @param beta1 float, default=0.9 +#' Exponential decay rate for the first moment estimates. +#' @param beta2 float, default=0.999 +#' Exponential decay rate for the second moment estimates. +#' @param epsilon float, default=1e-8 +#' @param wd float, default=0.0 +#' L2 regularization coefficient add to all the weights. +#' @param rescale.grad float, default=1.0 +#' rescaling factor of gradient. +#' @param clip_gradient float, optional +#' clip gradient in range [-clip_gradient, clip_gradient]. +#' @param lr_scheduler function, optional +#' The learning rate scheduler. +#' +mx.opt.adam <- function(learning.rate=0.001, + beta1=0.9, + beta2=0.999, + epsilon=1e-8, + wd=0, + rescale.grad=1, + clip_gradient = NULL, + lr_scheduler = NULL) { + # use lr as short for learning rate. 
+ lr <- learning.rate + count <- 0 + num_update <- 0 + + adam <- new.env() + adam$lr <- lr + adam$count <- 0 + adam$num_update <- 0 + + create.state <- function(index, weight) { + return (list(mean=mx.nd.zeros(dim(weight), ctx(weight)), + variance=mx.nd.zeros(dim(weight), ctx(weight)))) + } + + update <- function(index, weight, grad, state) { + if (!is.null(lr_scheduler)){ + lr_scheduler(adam) ## changing lr + lr <- adam$lr + ## update count + indexKey <- paste0('ik', index) + if (!exists(envir = adam, x = indexKey)){ + assign(x = indexKey, value = 0, envir = adam) + } else { + indexValue <- get(envir = adam, x = indexKey) + assign(x = indexKey, value = indexValue + 1, envir = adam) + adam$num_update <- max(adam$num_update, get(envir = adam, x = indexKey)) + } + } + + # increment time + time.key <- paste0('t', index) + if (!exists(envir = adam, x = time.key)){ + assign(x = time.key, value = 0, envir = adam) + } + t <- get(envir = adam, x = time.key) + t <- t + 1 + assign(x = time.key, value = t, envir = adam) + + mean <- state$mean + variance <- state$variance + + grad <- grad * rescale.grad + if (!is.null(clip_gradient)){ + if(clip_gradient >= 0){ + grad_ctx <- ctx(grad) + grad <- as.array(grad) + grad <- pmax(grad, -1 * clip_gradient) + grad <- pmin(grad, clip_gradient) + grad <- mx.nd.array(grad, grad_ctx) + } else { + stop("Error: clip_gradient should be positive number.") + } + } + + mean <- beta1 * mean + (1 - beta1) * grad + variance <- beta2 * variance + (1 - beta2) * (grad * grad) + + coef1 <- 1 - beta1^t + coef2 <- 1 - beta2^t + lr <- lr * sqrt(coef2)/coef1 + + weight <- weight - lr * mean / (mx.nd.sqrt(variance) + epsilon) + weight <- weight - lr * wd * weight + + state <- list(mean=mean, variance=variance) + + return(list(weight=weight, state=state)) + } + return(list(create.state=create.state, update=update)) +} + +#' Create an AdaGrad optimizer with respective parameters. 
+#' AdaGrad optimizer of Duchi et al., 2011, +#' +#' This code follows the version in http://arxiv.org/pdf/1212.5701v1.pdf Eq(5) +#' by Matthew D. Zeiler, 2012. AdaGrad will help the network to converge faster +#' in some cases. +#' +#' @param learning.rate float, default=0.05 +#' Step size. +#' @param epsilon float, default=1e-8 +#' @param wd float, default=0.0 +#' L2 regularization coefficient add to all the weights. +#' @param rescale.grad float, default=1.0 +#' rescaling factor of gradient. +#' @param clip_gradient float, optional +#' clip gradient in range [-clip_gradient, clip_gradient]. +#' @param lr_scheduler function, optional +#' The learning rate scheduler. +#' +mx.opt.adagrad <- function(learning.rate=0.05, + epsilon=1e-8, + wd=0, + rescale.grad=1, + clip_gradient = NULL, + lr_scheduler = NULL) { + # use lr as short for learning rate. + lr <- learning.rate + count <- 0 + num_update <- 0 + + adagrad <- new.env() + adagrad$lr <- lr + adagrad$count <- 0 + adagrad$num_update <- 0 + + create.state <- function(index, weight) { + return (mx.nd.zeros(dim(weight), ctx(weight))) #history + } + + update <- function(index, weight, grad, state) { + if (!is.null(lr_scheduler)){ + lr_scheduler(adagrad) ## changing lr + lr <- adagrad$lr + ## update count + indexKey <- paste0('ik', index) + if (!exists(envir = adagrad, x = indexKey)){ + assign(x = indexKey, value = 0, envir = adagrad) + } else { + indexValue <- get(envir = adagrad, x = indexKey) + assign(x = indexKey, value = indexValue + 1, envir = adagrad) + adagrad$num_update <- max(adagrad$num_update, get(envir = adagrad, x = indexKey)) + } + } + + grad <- grad * rescale.grad + if (!is.null(clip_gradient)){ + if(clip_gradient >= 0){ + grad_ctx <- ctx(grad) + grad <- as.array(grad) + grad <- pmax(grad, -1 * clip_gradient) + grad <- pmin(grad, clip_gradient) + grad <- mx.nd.array(grad, grad_ctx) + } else { + stop("Error: clip_gradient should be positive number.") + } + } + + history <- state + history <- history + 
(grad * grad) + weight <- weight - lr * (grad / mx.nd.sqrt(history + epsilon) + wd * weight) + state <- history + + return(list(weight=weight, state=state)) + } + return(list(create.state=create.state, update=update)) +} + +#' Create an AdaDelta optimizer with respective parameters. +#' +#' AdaDelta optimizer as described in Zeiler, M. D. (2012). +#' *ADADELTA: An adaptive learning rate method.* +#' http://arxiv.org/abs/1212.5701 +#' +#' @param rho float, default=0.90 +#' Decay rate for both squared gradients and delta x. +#' @param epsilon float, default=1e-5 +#' The constant as described in the thesis. +#' @param wd float, default=0.0 +#' L2 regularization coefficient add to all the weights. +#' @param rescale.grad float, default=1.0 +#' rescaling factor of gradient. +#' @param clip_gradient float, optional +#' clip gradient in range [-clip_gradient, clip_gradient]. +#' +mx.opt.adadelta <- function(rho=0.90, + epsilon=1e-5, + wd=0, + rescale.grad=1, + clip_gradient = NULL) { + adadelta <- new.env() + + create.state <- function(index, weight) { + return (list(acc.g=mx.nd.zeros(dim(weight), ctx(weight)), # accumulated g + acc.delta=mx.nd.zeros(dim(weight), ctx(weight)))) # accumulated delta + } + + update <- function(index, weight, grad, state) { + # preprocess grad + grad <- grad * rescale.grad + if (!is.null(clip_gradient)){ + if(clip_gradient >= 0){ + grad_ctx <- ctx(grad) + grad <- as.array(grad) + grad <- pmax(grad, -1 * clip_gradient) + grad <- pmin(grad, clip_gradient) + grad <- mx.nd.array(grad, grad_ctx) + } else { + stop("Error: clip_gradient should be positive number.") + } + } + + # accumulated g and delta initialization + acc.g <- state$acc.g + acc.delta <- state$acc.delta + + # update g, delta + acc.g <- rho * acc.g + (1 - rho) * (grad * grad) + current.delta <- mx.nd.sqrt(acc.delta + epsilon) / mx.nd.sqrt(acc.g + epsilon) * grad + acc.delta <- rho * acc.delta + (1 - rho) * (current.delta * current.delta) + weight <- weight - current.delta - wd * weight 
+ state <- list(acc.g=acc.g, acc.delta=acc.delta) + + return(list(weight=weight, state=state)) + } + return(list(create.state=create.state, update=update)) +} + #' Create an optimizer by name and parameters #' #' @param name The name of the optimizer @@ -162,6 +402,15 @@ mx.opt.create <- function(name, ...) { else if (name == "rmsprop") { return (mx.opt.rmsprop(...)) } + else if (name == "adam") { + return (mx.opt.adam(...)) + } + else if (name == "adagrad") { + return (mx.opt.adagrad(...)) + } + else if (name == "adadelta") { + return (mx.opt.adadelta(...)) + } stop(paste("Unknown optimizer ", name)) } diff --git a/R-package/R/zzz.R b/R-package/R/zzz.R index dc91e55fa436..d19cb2d33029 100644 --- a/R-package/R/zzz.R +++ b/R-package/R/zzz.R @@ -12,6 +12,8 @@ #' @import methods Rcpp NULL +.MXNetEnv <- new.env() + .onLoad <- function(libname, pkgname) { # Require methods for older versions of R require(methods) diff --git a/R-package/man/mx.opt.adadelta.Rd b/R-package/man/mx.opt.adadelta.Rd new file mode 100644 index 000000000000..c5e698b54797 --- /dev/null +++ b/R-package/man/mx.opt.adadelta.Rd @@ -0,0 +1,31 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/optimizer.R +\name{mx.opt.adadelta} +\alias{mx.opt.adadelta} +\title{Create an AdaDelta optimizer with respective parameters.} +\usage{ +mx.opt.adadelta(rho = 0.9, epsilon = 1e-05, wd = 0, rescale.grad = 1, + clip_gradient = NULL) +} +\arguments{ +\item{rho}{float, default=0.90 +Decay rate for both squared gradients and delta x.} + +\item{epsilon}{float, default=1e-5 +The constant as described in the thesis.} + +\item{wd}{float, default=0.0 +L2 regularization coefficient add to all the weights.} + +\item{rescale.grad}{float, default=1.0 +rescaling factor of gradient.} + +\item{clip_gradient}{float, optional +clip gradient in range [-clip_gradient, clip_gradient].} +} +\description{ +AdaDelta optimizer as described in Zeiler, M. D. (2012). 
+*ADADELTA: An adaptive learning rate method.* +http://arxiv.org/abs/1212.5701 +} + diff --git a/R-package/man/mx.opt.adagrad.Rd b/R-package/man/mx.opt.adagrad.Rd new file mode 100644 index 000000000000..a5ca8be82f5c --- /dev/null +++ b/R-package/man/mx.opt.adagrad.Rd @@ -0,0 +1,34 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/optimizer.R +\name{mx.opt.adagrad} +\alias{mx.opt.adagrad} +\title{Create an AdaGrad optimizer with respective parameters. +AdaGrad optimizer of Duchi et al., 2011,} +\usage{ +mx.opt.adagrad(learning.rate = 0.05, epsilon = 1e-08, wd = 0, + rescale.grad = 1, clip_gradient = NULL, lr_scheduler = NULL) +} +\arguments{ +\item{learning.rate}{float, default=0.05 +Step size.} + +\item{epsilon}{float, default=1e-8} + +\item{wd}{float, default=0.0 +L2 regularization coefficient add to all the weights.} + +\item{rescale.grad}{float, default=1.0 +rescaling factor of gradient.} + +\item{clip_gradient}{float, optional +clip gradient in range [-clip_gradient, clip_gradient].} + +\item{lr_scheduler}{function, optional +The learning rate scheduler.} +} +\description{ +This code follows the version in http://arxiv.org/pdf/1212.5701v1.pdf Eq(5) +by Matthew D. Zeiler, 2012. AdaGrad will help the network to converge faster +in some cases. +} + diff --git a/R-package/man/mx.opt.adam.Rd b/R-package/man/mx.opt.adam.Rd new file mode 100644 index 000000000000..74002c8c8298 --- /dev/null +++ b/R-package/man/mx.opt.adam.Rd @@ -0,0 +1,41 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/optimizer.R +\name{mx.opt.adam} +\alias{mx.opt.adam} +\title{Create an Adam optimizer with respective parameters. 
+Adam optimizer as described in [King2014].} +\usage{ +mx.opt.adam(learning.rate = 0.001, beta1 = 0.9, beta2 = 0.999, + epsilon = 1e-08, wd = 0, rescale.grad = 1, clip_gradient = NULL, + lr_scheduler = NULL) +} +\arguments{ +\item{learning.rate}{float, default=0.001 +Step size.} + +\item{beta1}{float, default=0.9 +Exponential decay rate for the first moment estimates.} + +\item{beta2}{float, default=0.999 +Exponential decay rate for the second moment estimates.} + +\item{epsilon}{float, default=1e-8} + +\item{wd}{float, default=0.0 +L2 regularization coefficient add to all the weights.} + +\item{rescale.grad}{float, default=1.0 +rescaling factor of gradient.} + +\item{clip_gradient}{float, optional +clip gradient in range [-clip_gradient, clip_gradient].} + +\item{lr_scheduler}{function, optional +The learning rate scheduler.} +} +\description{ +[King2014] Diederik Kingma, Jimmy Ba, +Adam: A Method for Stochastic Optimization, +http://arxiv.org/abs/1412.6980 +} + diff --git a/README.md b/README.md index 2e9e15725ad3..c42e0b563685 100644 --- a/README.md +++ b/README.md @@ -2,10 +2,8 @@ ===== [![Build Status](https://travis-ci.org/dmlc/mxnet.svg?branch=master)](https://travis-ci.org/dmlc/mxnet) -[![Build Status](http://ci.dmlc.ml/buildStatus/icon?job=mxnet)](http://ci.dmlc.ml/job/mxnet) -[![Documentation Status](https://readthedocs.org/projects/mxnet/badge/?version=latest)](http://mxnet.readthedocs.org/en/latest/) +[![Documentation Status](https://readthedocs.org/projects/mxnet/badge/?version=latest)](http://mxnet.io/) [![GitHub license](http://dmlc.github.io/img/apache2.svg)](./LICENSE) -[![todofy badge](https://todofy.org/b/dmlc/mxnet)](https://todofy.org/r/dmlc/mxnet) ![banner](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/image/banner.png) diff --git a/dmlc-core b/dmlc-core index 755f577a38cf..bf321638b22d 160000 --- a/dmlc-core +++ b/dmlc-core @@ -1 +1 @@ -Subproject commit 755f577a38cf3aa07f38a2667ffc583d22195e52 +Subproject commit 
bf321638b22d1d33bb36775e925f7b43b22db688 diff --git a/docs/.dockerignore b/docs/.dockerignore new file mode 100644 index 000000000000..8f1738ff0c2f --- /dev/null +++ b/docs/.dockerignore @@ -0,0 +1,3 @@ +Dockerfile +_build + diff --git a/docs/Dockerfile b/docs/Dockerfile new file mode 100644 index 000000000000..bea556ed398a --- /dev/null +++ b/docs/Dockerfile @@ -0,0 +1,43 @@ +FROM ubuntu:14.04 +MAINTAINER Mu Li + +# +# First, build MXNet binaries (ref mxnet/docker/cpu/Dockerfile) +# + +RUN apt-get update && apt-get install -y build-essential git libopenblas-dev libopencv-dev +RUN git clone --recursive https://github.com/dmlc/mxnet/ && cd mxnet && \ + cp make/config.mk . && \ + echo "USE_BLAS=openblas" >>config.mk && \ + make -j$(nproc) + +# python pakcage +RUN apt-get install -y python-numpy wget unzip +ENV PYTHONPATH /mxnet/python + +# +# Now set up tools for doc build +# + +RUN apt-get update && apt-get install -y \ + doxygen \ + build-essential \ + git \ + python-pip + +RUN pip install sphinx==1.3.5 CommonMark==0.5.4 breathe mock==1.0.1 recommonmark + +WORKDIR /opt/mxnet/docs + +# Fool it into thinking it's on a READTHEDOCS server, so it builds the +# API reference +ENV READTHEDOCS true + +ENTRYPOINT /opt/mxnet/docs/build-preview.sh + +EXPOSE 8008 + +# Put this at the end so that you don't have to rebuild the earlier +# layers when iterating on the docs themselves. +ADD . /opt/mxnet/docs + diff --git a/docs/README b/docs/README deleted file mode 100644 index 71f2b69c817d..000000000000 --- a/docs/README +++ /dev/null @@ -1,14 +0,0 @@ -This document is automatically generated from the codes. 
- -A built version of document is available at http://mxnet.dmlc.ml - -To build the document locally, type - -```make html``` - -Install the required packages for Ubutun 14.04 - -``` -sudo apt-get install doxygen python-pip -sudo pip install sphinx==1.3.5 CommonMark==0.5.4 breathe mock -``` diff --git a/docs/README.md b/docs/README.md new file mode 100644 index 000000000000..7444c5ef7369 --- /dev/null +++ b/docs/README.md @@ -0,0 +1,50 @@ +# MXNet documentation + +A built version of document is available at http://mxnet.dmlc.ml + +## To build docs with Docker + +The `Dockerfile` in this directory encapsulates all the dependencies needed +to build the docs. The default entrypoint builds the docs and serves them +through a simple HTTP server for previewing. + +``` +docker build -t mxnet/docs . +docker run -it -p 8008:8008 mxnet/docs +open http://localhost:8008/ +``` + +### Faster iterative development + +If you are working on the docs and want to rebuild them without creating a new +docker image each time, you can do this with + +``` +docker run -it -p 8008:8008 -v `pwd`:/opt/mxnet/docs mxnet/docs +``` + +which maps your current directory into the docker image to get any local +changes. + +**NOTE:** Any changes to the API reference will not get rebuilt this way. +The API reference docs are introspected from the built binaries, which +in this Dockerfile are pulled from github/dmlc/master. To work-around +this, map a volume with your code changes into the container, and rebuild +MXNet in the container before doing the doc build. Or use the local +build described below. + +## Local build + +To build the documentation without docker on your local machine, first +install the required packages for Ubutun 14.04. These are approximately: + +``` +sudo apt-get install doxygen python-pip +sudo pip install sphinx==1.3.5 CommonMark==0.5.4 breathe mock==1.0.1 recommonmark +``` + +(Refer to the Dockerfile for a more reliable description of the dependencies.) 
+Once the MXNet binaries are built, and you have the dependencies installed, +you can build the docs with: + +```make html``` diff --git a/docs/_static/mxnet-theme/index.html b/docs/_static/mxnet-theme/index.html index a0901e42783d..cb8c15195768 100644 --- a/docs/_static/mxnet-theme/index.html +++ b/docs/_static/mxnet-theme/index.html @@ -26,29 +26,29 @@

Flexible

-

Supports both imperative and symbolic programmings.

+

Supports both imperative and symbolic programming

Portable

Runs on CPUs or GPUs, on clusters, servers, desktops, or mobile phones

-

Multiple Lanuages

+

Multiple Languages

Supports over 7 programming languages, including C++, Python, R, - Scala, Julia, Matlab, and Javascripts.

+ Scala, Julia, Matlab, and Javascript

Auto-Differentiation

-

Calculates the gradient automatically for training a model.

+

Calculates the gradient automatically for training a model

Distributed on Cloud

Supports distributed training on multiple CPU/GPU machines, including AWS, - GCE, Azure, and Yarn clusters.

+ GCE, Azure, and Yarn clusters

Performance

-

The well-optimized C++ backend engine parallelize both I/O and computations

+

Optimized C++ backend engine parallelizes both I/O and computation

@@ -59,7 +59,7 @@

Performance

MXNet is developed by collaborators from multiple universities and - companies. We sincerely thank the following organizations for supporing + companies. We sincerely thank the following organizations for supporting MXNet and sponsoring its major developers (alphabetical order).

@@ -85,6 +85,8 @@

Performance


  + +     diff --git a/docs/_static/mxnet.css b/docs/_static/mxnet.css index 83de570c5683..39fe4346c6fd 100644 --- a/docs/_static/mxnet.css +++ b/docs/_static/mxnet.css @@ -4,9 +4,6 @@ a, abbr, acronym, address, applet, big, blockquote, body, caption, cite, code, d padding: 0; border: 0; outline: 0; - font-weight: inherit; - font-style: inherit; - font-family: inherit; font-size: 100 %; vertical-align: baseline } @@ -23,11 +20,6 @@ body { -moz-osx-font-smoothing: grayscale; } -ol, -ul { - list-style: none -} - table { border-collapse: separate; border-spacing: 0 @@ -423,7 +415,7 @@ div.sphinxsidebar { div.sphinxsidebar ul { padding: 0 } div.sphinxsidebar ul ul { margin-left: 15px } -@media (min-width:1200px) { +@media (min-width:1000px) { .content { float: right; width: 66.66666667%; margin-right: 5% } div.sphinxsidebar {display: block} } diff --git a/docs/build-preview.sh b/docs/build-preview.sh new file mode 100755 index 000000000000..40bcfbd63184 --- /dev/null +++ b/docs/build-preview.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +# Script to build the HTML docs and serve them. +# Run within docker container for best results. + +echo "Building MXNet documentation..." +make clean +make html +echo "Done building MXNet documentation..." + +echo "Serving MXNet docs on port 8008..." 
+cd _build/html +python -m SimpleHTTPServer 8008 + diff --git a/docs/conf.py b/docs/conf.py index b0f85b1b4608..d9d72b856367 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -27,7 +27,7 @@ # -- mock out modules import mock -MOCK_MODULES = ['numpy', 'scipy', 'scipy.sparse', 'sklearn', 'matplotlib'] +MOCK_MODULES = ['numpy', 'numpy.testing', 'scipy', 'scipy.sparse', 'sklearn', 'matplotlib'] for mod_name in MOCK_MODULES: sys.modules[mod_name] = mock.Mock() diff --git a/docs/get_started/index.md b/docs/get_started/index.md index 0133d3fa033b..a7c90e3d3f5c 100644 --- a/docs/get_started/index.md +++ b/docs/get_started/index.md @@ -278,13 +278,13 @@ probs = mx.predict(model, test_provider) ## Tensor Computation Next we briefly introduce the tensor computation interface, which is often more -flexiable to use than the previous symbolic interface. It is often used to +flexible to use than the previous symbolic interface. It is often used to implement the layers, define weight updating rules, and debug. ### Python -The python inferface is similar to `numpy.NDArray`. +The python interface is similar to `numpy.NDArray`. ```python >>> import mxnet as mx @@ -333,3 +333,5 @@ res3: ml.dmlc.mxnet.Shape = (2,3) ``` ### Julia + +Coming soon... diff --git a/docs/get_started/overview_zh.md b/docs/get_started/overview_zh.md index f1a812422107..949dc6a9a1e0 100644 --- a/docs/get_started/overview_zh.md +++ b/docs/get_started/overview_zh.md @@ -1,5 +1,5 @@ The following is an overview of MXNet in Chinese. 
For english readers, please -refer to our [NIPS learnsys paper](http://learningsys.org/papers/LearningSys_2015_paper_1.pdf) +refer to our [NIPS LearningSys paper](http://learningsys.org/papers/LearningSys_2015_paper_1.pdf) # MXNet设计和实现简介 diff --git a/docs/how_to/bucketing.md b/docs/how_to/bucketing.md index 88b70cf9e2e1..b13506f0b976 100644 --- a/docs/how_to/bucketing.md +++ b/docs/how_to/bucketing.md @@ -14,7 +14,7 @@ Taking the [PennTreeBank language model example](https://github.com/dmlc/mxnet/t The architecture used in the example is a simple word-embedding layer followed by two LSTM layers. In the original example, the model is unrolled explicitly in time for a fixed length of 32. In this demo, we will show how to use bucketing to implement variable-length sequence training. -In order to enable bucketing, MXNet need to know how to construct a new unrolled symbolic architecture for a different sequence length. To achieve this, instead of constructing a model with a fixed `Symbol`, we use a callback function that generating new `Symbol` on a *bucket key*. +In order to enable bucketing, MXNet need to know how to construct a new unrolled symbolic architecture for a different sequence length. To achieve this, instead of constructing a model with a fixed `Symbol`, we use a callback function that generates new `Symbol` on a *bucket key*. ```python @@ -35,11 +35,11 @@ The data iterator need to report the `default_bucket_key`, which allows MXNet to However, to achieve training, we still need to add some extra bits to our `DataIter`. Apart from reporting the `default_bucket_key` as mentioned above, we also need to report the current `bucket_key` for each mini-batch. More specifically, the `DataBatch` object returned in each mini-batch by the `DataIter` should contain the following *extra* properties -* `bucket_key`: the bucket key corresponding to this batch of data. In our example, it will be the sequence length for this batch of data. 
If the executors corresponding to this bucket key has not yet been created, they will be constructed according to the symbolic model returned by `gen_sym` on this bucket key. Created executors will be cached for future use. Note the generated `Symbol` could be arbitrary, but they should all have the same trainable parameters and auxiliary states. +* `bucket_key`: The bucket key corresponding to this batch of data. In our example, it will be the sequence length for this batch of data. If the executors corresponding to this bucket key has not yet been created, they will be constructed according to the symbolic model returned by `gen_sym` on this bucket key. Created executors will be cached for future use. Note that generated `Symbol` could be arbitrary, but they should all have the same trainable parameters and auxiliary states. * `provide_data`: this is the same information reported by the `DataIter` object. Since now each bucket corresponds to different architecture, they could have different input data. Also, one **should** make sure that the `provide_data` information returned by the `DataIter` object is compatible with the architecture for `default_bucket_key`. * `provide_label`: the same as `provide_data`. -Now the `DataIter` is responsible for grouping the data into different buckets. Assuming randomization is enabled, in each mini-batch, it choose a random bucket (according to a distribution balanced by the bucket sizes), and then randomly choose sequences from that bucket to form a mini-batch. And do some proper *padding* for sequences of different length *within* the mini-batch if necessary. +Now, the `DataIter` is responsible for grouping the data into different buckets. Assuming randomization is enabled, in each mini-batch, it choose a random bucket (according to a distribution balanced by the bucket sizes), and then randomly choose sequences from that bucket to form a mini-batch. 
And do some proper *padding* for sequences of different length *within* the mini-batch if necessary. Please refer to [example/rnn/lstm_ptb_bucketing.py](https://github.com/dmlc/mxnet/blob/master/example/rnn/lstm_bucketing.py) for the full implementation of a `DataIter` that read text sequences implement the API shown above. In this example, bucketing can be used with a static configuration (e.g. `buckets = [10, 20, 30, 40, 50, 60]`), or let MXnet to generate bucketing automatically according to dataset(`buckets = []`). The latter approach is implemented with greedily adding a bucket as long as the number of input for the bucket is large enough(see [default_gen_buckets()](https://github.com/dmlc/mxnet/blob/master/example/rnn/bucket_io.py#L43)). diff --git a/docs/how_to/build.md b/docs/how_to/build.md index 4ef8ac5420b7..4dfa89a64c5c 100644 --- a/docs/how_to/build.md +++ b/docs/how_to/build.md @@ -142,6 +142,9 @@ various distributed filesystem such as HDFS/Amazon S3/... #### Building with Intel MKL Support First, `source /path/to/intel/bin/compilervars.sh` to automatically set environment variables. Then, edit [make/config.mk](../../make/config.mk), let `USE_BLAS = mkl`. `USE_INTEL_PATH = NONE` is usually not necessary to be modified. +#### Building for distributed training +To be able to run distributed training jobs, the `USE_DIST_KVSTORE=1` flag must be set. This enables a distributed +key-value store needed to share parameters between multiple machines working on training the same neural network. ## Python Package Installation @@ -351,3 +354,67 @@ For a guide to Docker, see the [official docs](https://docs.docker.com). CUDA support requires [NVIDIA Docker](https://github.com/NVIDIA/nvidia-docker). For more details on how to use the MXNet Docker images, consult the [source project](https://github.com/Kaixhin/dockerfiles). + +## Build Dependent Libraries from Source + +This section we provide instructions to build MXNet' dependent libraries from source. 
It is often useful in two situations: + +- You use a low version or server linux, there is no according packages or the package versions are low by using `yum` or `apt-get` +- You do not have the root permission to install packages. In this case, you need to change the install directory from `/usr/local` into another one such as `${HOME}` in the following examples. + +### Build GCC from Source + +Building gcc needs 32-bit libc, you can install it by + +- Ubuntu: `sudo apt-get install libc6-dev-i386` +- Red Hat `sudo yum install glibc-devel.i686` +- CentOS 5.8, `sudo yum install glibc-devel.i386` +- CentOS 6 / 7, `sudo yum install glibc-devel.i686` + +First download +```bash +wget http://mirrors.concertpass.com/gcc/releases/gcc-4.8.5/gcc-4.8.5.tar.gz +tar -zxf gcc-4.8.5.tar.gz +cd gcc-4.8.5 +./contrib/download_prerequisites +``` + +Then build +``` +mkdir release && cd release +../configure --prefix=/usr/local --enable-languages=c,c++ +make -j10 +sudo make install +``` + +Finally you may want to add lib path in your `~/.bashrc` +```bash +export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib64 +``` + +### Build Opencv from Source + +First download opencv +```bash +wget https://codeload.github.com/opencv/opencv/zip/2.4.13 +unzip 2.4.13 +cd opencv-2.4.13 +mkdir release +cd release/ +``` +Building opencv needs cmake, if you do not have cmake or your cmake verion is too low (e.g the one installed by default on RHEL), then +```bash +wget https://cmake.org/files/v3.6/cmake-3.6.1-Linux-x86_64.tar.gz +tar -zxvf cmake-3.6.1-Linux-x86_64.tar.gz +alias cmake="cmake-3.6.1-Linux-x86_64/bin/cmake" +``` +Now build opencv. We disable GPU support, which may significantly slow down to run a MXNet program on GPU. We also disable 1394 which may generate warning. +```bash +cmake -D BUILD_opencv_gpu=OFF -D WITH_CUDA=OFF -D WITH_1394=OFF -D CMAKE_BUILD_TYPE=RELEASE -D CMAKE_INSTALL_PREFIX=/usr/local .. 
+make -j8 +sudo make install +``` +Finally, you may want to add the following into the end of your `~/.bashrc`: +```bash +export PKG_CONFIG_PATH=$PKG_CONFIG_PATH:/usr/local/lib/pkgconfig/ +``` diff --git a/docs/how_to/caffe.md b/docs/how_to/caffe.md new file mode 100644 index 000000000000..d3ec17426524 --- /dev/null +++ b/docs/how_to/caffe.md @@ -0,0 +1,49 @@ +# How to use Caffe operator in MXNet + +[Caffe](http://caffe.berkeleyvision.org/) has been a well-known and widely-used deep learning framework. Now MXNet has supported calling most Caffe operators(layers) and loss functions directly in its symbolic graph! Using one's own customized caffe layer is also effortless. + +Besides Caffe, MXNet has already embedded Torch modules and its tensor mathematical functions. ([link](https://github.com/dmlc/mxnet/blob/master/docs/how_to/torch.md)) + +This blog demonstrates two steps to use Caffe op in MXNet: + +* How to install MXNet with Caffe support. + +* How to embed Caffe op into MXNet's symbolic graph. + +## Install Caffe With MXNet interface +* Download official Caffe repository [BVLC/Caffe](https://github.com/BVLC/caffe). +* Download [Caffe patch for mxnet interface] (https://github.com/BVLC/caffe/pull/4527.patch). Move patch file under your Caffe root folder and apply the patch by `git apply patch_file_name`. +* Install Caffe following the [official guide](http://caffe.berkeleyvision.org/installation.html). + +## Compile with Caffe +* In mxnet folder, open `config.mk` (if you haven't already, copy `make/config.mk` (Linux) or `make/osx.mk` (Mac) into MXNet root folder as `config.mk`) and uncomment the lines `CAFFE_PATH = $(HOME)/caffe` and `MXNET_PLUGINS += plugin/caffe/caffe.mk`. Modify `CAFFE_PATH` to your Caffe installation if necessary. +* Run `make clean && make` to build with Caffe support. 
+ +## Caffe Operator (Layer) +Caffe's neural network operator and loss functions are supported by MXNet through `mxnet.symbol.CaffeOp` and `mxnet.symbol.CaffeLoss` respectively. +For example, the following code shows [multi-layer perceptron] (https://en.wikipedia.org/wiki/Multilayer_perceptron)(MLP) network for classifying MNIST digits ([full code](https://github.com/dmlc/mxnet/blob/master/example/caffe/caffe_net.py)): + +### Python +```Python +data = mx.symbol.Variable('data') +fc1 = mx.symbol.CaffeOp(data_0=data, num_weight=2, name='fc1', prototxt="layer{type:\"InnerProduct\" inner_product_param{num_output: 128} }") +act1 = mx.symbol.CaffeOp(data_0=fc1, prototxt="layer{type:\"TanH\"}") +fc2 = mx.symbol.CaffeOp(data_0=act1, num_weight=2, name='fc2', prototxt="layer{type:\"InnerProduct\" inner_product_param{num_output: 64} }") +act2 = mx.symbol.CaffeOp(data_0=fc2, prototxt="layer{type:\"TanH\"}") +fc3 = mx.symbol.CaffeOp(data_0=act2, num_weight=2, name='fc3', prototxt="layer{type:\"InnerProduct\" inner_product_param{num_output: 10}}") +mlp = mx.symbol.SoftmaxOutput(data=fc3, name='softmax') +``` + +Let us break it down. First, `data = mx.symbol.Variable('data')` defines a variable as place holder for input. +Then it is fed through Caffe operators with `fc1 = mx.symbol.CaffeOp(data_0=data, num_weight=2, name='fc1', prototxt="layer{type:\"InnerProduct\" inner_product_param{num_output: 128} }")`. + +The inputs to Caffe op are named as data_i for i=0 ... num_data-1 as `num_data` is the number of inputs. You may skip the argument, as the example does, if its value is 1. While `num_weight` is number of `blobs_`(weights). Its default value is 0, as many ops maintain no weight. `prototxt` is the configuration string. 
+ +We could also replace the last line by: + +```Python +label = mx.symbol.Variable('softmax_label') +mlp = mx.symbol.CaffeLoss(data=fc3, label=label, grad_scale=1, name='softmax', prototxt="layer{type:\"SoftmaxWithLoss\"}") +``` + +to use loss function in Caffe. diff --git a/docs/how_to/cloud.md b/docs/how_to/cloud.md index 26cda6ab8cc0..0d5581cf7b2d 100644 --- a/docs/how_to/cloud.md +++ b/docs/how_to/cloud.md @@ -1,8 +1,8 @@ -# Cloud setup for MXNet +# Cloud Setup for MXNet ## Setup an AWS GPU Cluster from Scratch -In this document we give a step-by-step tutorial on how to set up Amazon AWS for +In this document, we provide a step-by-step tutorial on how to set up an Amazon AWS cluster with MXNet. In particular, we will address: - [Use Amazon S3 to host data](#use-amazon-s3-to-host-data) @@ -33,11 +33,11 @@ wget http://data.dmlc.ml/mxnet/data/mnist.zip unzip mnist.zip && s3cmd put t*-ubyte s3://dmlc/mnist/ ``` -### Set Up an EC2 GPU Instance +### Setup an EC2 GPU Instance MXNet requires the following libraries -- C++ compiler with C++11 suports, such as `gcc >= 4.8` +- C++ compiler with C++11 supports, such as `gcc >= 4.8` - `CUDA` (`CUDNN` in optional) for GPU linear algebra - `BLAS` (cblas, open-blas, atblas, mkl, or others) for CPU linear algebra - `opencv` for image augmentations @@ -60,7 +60,7 @@ We provide a public Amazon Machine Images, [ami-12fd8178](https://console.aws.am ### Build and Run MXNet on a GPU Instance -The following commands build MXNet with CUDA/CUDNN, S3, and distributed +The following commands build MXNet with CUDA/CUDNN, S3 and distributed training. 
```bash @@ -81,7 +81,7 @@ To test whether everything is installed properly, we train a Convolutional neura python tests/python/gpu/test_conv.py ``` -If the MNISt data is placed on `s3://dmlc/mnist`, we can read the S3 data directly with the following command +If the MNIST data is placed on `s3://dmlc/mnist`, we can read the S3 data directly with the following command ```bash sed -i.bak "s!data_dir = 'data'!data_dir = 's3://dmlc/mnist'!" tests/python/gpu/test_conv.py @@ -149,4 +149,4 @@ benchmark for the distributed training. We may consider other [examples](https:/ It is common to pack a dataset into multiple files, especially when working in a distributed environment. MXNet supports direct loading from multiple data shards. Simply put all the record files into a folder, and point the data path to the folder. #### Use YARN, MPI, SGE -While ssh can be simple for cases when we do not have a cluster scheduling framework. MXNet is designed to be able to port to various platforms. We also provide other scripts in [tracker](https://github.com/dmlc/dmlc-core/tree/master/tracker) to run on other cluster frameworks, including Hadoop(YARN) and SGE. Your contribution is more than welcomed to provide examples to run mxnet on your favorite distributed platform. +While ssh can be simple for cases when we do not have a cluster scheduling framework. MXNet is designed to be able to port to various platforms. We also provide other scripts in [tracker](https://github.com/dmlc/dmlc-core/tree/master/tracker) to run on other cluster frameworks, including Hadoop(YARN) and SGE. Your contribution is more than welcomed to provide examples to run MXNet on your favourite distributed platform. 
diff --git a/docs/how_to/contribute.md b/docs/how_to/contribute.md index 18568bc2a7dc..716d8869ae90 100644 --- a/docs/how_to/contribute.md +++ b/docs/how_to/contribute.md @@ -30,7 +30,7 @@ git rebase upstream/master it might be good to merge them together(use git rebase then squash) into more meaningful groups. * Send the pull request! - Fix the problems reported by automatic checks - - If you are contributing a new module, consider add a testcase in [tests](../tests) + - If you are contributing a new module, consider add a test case in [tests](../tests) Git Workflow Howtos ------------------- @@ -86,14 +86,14 @@ Documents Testcases --------- -* All the testcases are in [tests](../tests) -* We use python nose for python test cases and gtest for c++ unittests. +* All the test cases are in [tests](../tests) +* We use python nose for python test cases and gtest for c++ unit tests. Examples -------- * Usecases and examples will be in [example](../example) -* We are super excited to hear about your story, if you have blogposts, - tutorials code solutions using mxnet, please tell us and we will add +* We are super excited to hear about your story, if you have blog posts, + tutorials code solutions using MXNet, please tell us and we will add a link in the example pages. Core Library @@ -121,7 +121,7 @@ make rcpplint - When needed, you can disable the linter warning of certain line with ```// NOLINT(*)``` comments. ### Auto Generated API -- Many mxnet API are exposed from Rcpp side in a dynamic way. +- Many MXNet API are exposed from Rcpp side in a dynamic way. - The [mx_generated.R](R/mx_generated.R) is auto generated API and documents for these functions. - You can remake the file by typing the following command at root folder ```bash @@ -153,7 +153,7 @@ make the-markdown-to-make.md - Add the generated figure to the ```dmlc/web-data``` repo. 
- If you already cloned the repo to doc, this means a ```git add``` - Create PR for both the markdown and ```dmlc/web-data``` -- You can also build the document locally by typing the followig command at ```doc``` +- You can also build the document locally by typing the following command at ```doc``` ```bash make html ``` diff --git a/docs/how_to/env_var.md b/docs/how_to/env_var.md index d15e11386bde..b102d12be057 100644 --- a/docs/how_to/env_var.md +++ b/docs/how_to/env_var.md @@ -3,6 +3,8 @@ Environment Variables MXNet have several settings that can be changed via environment variable. Usually you do not need to change these settings, but they are listed here for reference. +## Set the number of threads + * MXNET_GPU_WORKER_NTHREADS (default=2) - Maximum number of threads that do the computation job on each GPU. * MXNET_GPU_COPY_NTHREADS (default=1) @@ -11,8 +13,11 @@ Usually you do not need to change these settings, but they are listed here for r - Maximum number of threads that do the CPU computation job. * MXNET_CPU_PRIORITY_NTHREADS (default=4) - Number of threads given to prioritized CPU jobs. + +## Memory options + * MXNET_EXEC_ENABLE_INPLACE (default=true) - - Whether to enable inplace optimization in symbolic execution. + - Whether to enable in place optimization in symbolic execution. * MXNET_EXEC_MATCH_RANGE (default=10) - The rough matching scale in symbolic execution memory allocator. - Set this to 0 if we do not want to enable memory sharing between graph nodes(for debug purpose). @@ -20,17 +25,32 @@ Usually you do not need to change these settings, but they are listed here for r - Maximum number of temp workspace we can allocate to each device. - Set this to small number can save GPU memory. - It will also likely to decrease level of parallelism, which is usually OK. +* MXNET_GPU_MEM_POOL_RESERVE (default=5) + - Percentage of GPU memory to reserve for things other than gpu array, such as kernel launch or cudnn handle space. 
+ - Try setting this to a larger value if you see strange out of memory error from kernel launch, after multiple iterations, etc. + +## Engine type + * MXNET_ENGINE_TYPE (default=ThreadedEnginePerDevice) - The type of underlying execution engine of MXNet. - List of choices - NaiveEngine: very simple engine that use master thread to do computation. - ThreadedEngine: a threaded engine that uses global thread pool to schedule jobs. - ThreadedEnginePerDevice: a threaded engine that allocates thread per GPU. + +## Control the data communication + * MXNET_KVSTORE_REDUCTION_NTHREADS (default=4) - - Number of threads used for summing of big arrays. + - Number of CPU threads used for summing of big arrays. * MXNET_KVSTORE_BIGARRAY_BOUND (default=1e6) - The minimum size of "big array". - When the array size is bigger than this threshold, MXNET_KVSTORE_REDUCTION_NTHREADS threads will be used for reduction. +* MXNET_ENABLE_GPU_P2P (default=1) + - If true, mxnet will try to use GPU peer-to-peer communication if available + when kvstore's type is `device` + +## Others + * MXNET_CUDNN_AUTOTUNE_DEFAULT (default=0) - The default value of cudnn_tune for convolution layers. - Auto tuning is turn off by default. Set to 1 to turn on by default for benchmarking. @@ -45,4 +65,3 @@ Settings for More GPU Parallelism - Set ```MXNET_GPU_WORKER_NTHREADS``` to larger number (e.g. 2) - You may want to set ```MXNET_EXEC_NUM_TEMP``` to reduce memory usage. - This may not speed things up, especially for image applications, because GPU is usually fully utilized even with serialized jobs. - diff --git a/docs/how_to/faq.md b/docs/how_to/faq.md index 261526f1faef..f709292d35d6 100644 --- a/docs/how_to/faq.md +++ b/docs/how_to/faq.md @@ -1,10 +1,45 @@ -Frequently Asked Questions -========================== -This document contains the frequently asked questions to mxnet. 
+# Frequently Asked Questions -How to Copy Part of Parameters to Another Model ------------------------------------------------ -Most MXNet's model consists two parts, the argument arrays and symbol. You can simply copy the argument arrary to the argument array of another model. For example, in python model API, you can do +We collect the frequently asked questions on [mxnet/issues](https://github.com/dmlc/mxnet/issues). For whom is going to post issues, please consider to check this page first. For contributors, please make the questions and answers simple; otherwise put the detailed answer in more proper place and then refer the link. + +## Build and install + +The answers for most questions can be found on the [build page](build.md) + +## Speed + +#### Took a long time to start running on GPU + +Try to disable opencv to use GPU, such as [build opencv from source with GPU module disabled](build.md#build-opencv-from-source). + +#### Slow on a single GPU + +Check the following items: + +1. Check your CUDA/driver version is not too old. +2. Build with `USE_CUDNN=1`, often brings 50+% speed up. Try to use the newest version. +3. Set `export MXNET_CUDNN_AUTOTUNE_DEFAULT=1` before running, often 10%-15% speed up +4. Disable ECC if using Tesla GPUs by `nvidia-smi -e 0`. Root permission and reboot may be needed. +5. Set to maximal clock for Tesla cards by `nvidia-smi -ac ??`. See [this blog](https://devblogs.nvidia.com/parallelforall/increase-performance-gpu-boost-k80-autoboost/) +6. Check no throttle reason `nvidia-smi -q -d PERFORMANCE` often caused by temperature. + +#### No speed up for using more than one GPUs or machines. + +Check the following items: +1. Does your neural network already run fast, such as >1000 example/sec or >10 batches/sec? If yes, it's unlikely to get further speed up for adding more resources due to the communication overhead. +2. Are you using a small batch size? Try to increase it. +3. Are you using more than 4 GPUs? 
Try to use `--kv-store=device` + +## Memory Usage + +#### Abnormal CPU memory usage + +May be due to the data pre-fetch. Refer to [issue 2111](https://github.com/dmlc/mxnet/issues/2111) Should be fixed later. + +## the following part needs refactor + +#### How to Copy Part of Parameters to Another Model +Most MXNet's model consists two parts, the argument arrays and symbol. You can simply copy the argument array to the argument array of another model. For example, in python model API, you can do ```python copied_model = mx.model.FeedForward(ctx=mx.gpu(), symbol=new_symbol, arg_params=old_arg_params, aux_params=old_aux_params, @@ -12,13 +47,11 @@ copied_model = mx.model.FeedForward(ctx=mx.gpu(), symbol=new_symbol, ``` To copy model parameter from existing ```old_arg_params```, see also this [notebook](https://github.com/dmlc/mxnet/blob/master/example/notebooks/predict-with-pretrained-model.ipynb) -How to Extract Feature Map of Certain Layer ------------------------------------------- +#### How to Extract Feature Map of Certain Layer See this [notebook](https://github.com/dmlc/mxnet/blob/master/example/notebooks/predict-with-pretrained-model.ipynb) -What is the relation between MXNet and CXXNet, Minerva, Purine2 ---------------------------------------------------------------- +#### What is the relation between MXNet and CXXNet, Minerva, Purine2 MXNet is created in collaboration by authors from the three projects. The project reflects what we have learnt from the past projects. It combines important flavour of the existing projects, being @@ -29,12 +62,6 @@ ways of programming, and write CPU/GPU applications that are more memory efficient than cxxnet, purine and more flexible than minerva. -What is the Relation to Tensorflow ----------------------------------- -Both MXNet and Tensorflow use a computation graph abstraction, which is initially used by Theano, then also adopted by other packages such as CGT, caffe2, purine. 
Currently TensorFlow adopts an optimized symbolic API. While mxnet supports a more [mixed flavor](https://mxnet.readthedocs.org/en/latest/program_model.html), with a dynamic dependency scheduler to combine symbolic and imperative programming together. -In short, mxnet is lightweight and “mixed”, with flexiblity from imperative programing, while getting similar advantages by using a computation graph to make it very fast and memory efficient. That being said, most systems will involve and we expect both systems can learn and benefit from each other. - - -How to Build the Project ------------------------ -See [build instruction](build.md) +#### What is the Relation to Tensorflow +Both MXNet and [Tensorflow](https://www.tensorflow.org/) use a computation graph abstraction, which is initially used by Theano, then also adopted by other packages such as CGT, caffe2, purine. Currently TensorFlow adopts an optimized symbolic API. While MXNet supports a more [mixed flavour](https://mxnet.readthedocs.org/en/latest/program_model.html), with a dynamic dependency scheduler to combine symbolic and imperative programming together. +In short, MXNet is lightweight and “mixed”, with flexibility from imperative programming, while getting similar advantages by using a computation graph to make it very fast and memory efficient. That being said, most systems will evolve and we expect both systems can learn and benefit from each other. diff --git a/docs/how_to/index.md b/docs/how_to/index.md index 0bf00ee149c3..1eda66a46851 100644 --- a/docs/how_to/index.md +++ b/docs/how_to/index.md @@ -1,6 +1,6 @@ # MXNet How To -This page contains guidelines to use and develop mxnets. +This page contains guidelines to use and develop MXNet. ## Installation - [How to Install MXNet](build.md)
- [Train LSTM with multiple GPUs in model parallelism](model_parallel_lstm.md) - [Run MXNet on smart devices](smart_device.md) - [Run MXNet on cloud](cloud.md) -- [Use pretrained models](pretrained.md) -- [Use mxnet on variable input length/size (bucketing)](bucketing.md) +- [Use pre-trained models](pretrained.md) +- [Use MXNet on variable input length/size (bucketing)](bucketing.md) - [Improve performance tips](perf.md) ## Develop and Hack MXNet diff --git a/docs/how_to/model_parallel_lstm.md b/docs/how_to/model_parallel_lstm.md index 8e8e99a13da7..b0d72908a88d 100644 --- a/docs/how_to/model_parallel_lstm.md +++ b/docs/how_to/model_parallel_lstm.md @@ -6,7 +6,7 @@ An example of LSTM training with model parallelism is provided in [example/model There's a very good [introduction](http://colah.github.io/posts/2015-08-Understanding-LSTMs/) to LSTM by Christopher. -LSTM evaluation is inheritantly hard due to its complex data dependency. LSTM training, which has more data dependency in reverse order at its back propagation phase, is even harder to parallelize. +LSTM evaluation is inherently hard due to its complex data dependency. LSTM training, which has more data dependency in reverse order at its back propagation phase, is even harder to parallelize. ## Model parallel: Using multiple GPUs as a pipeline @@ -15,7 +15,7 @@ Model parallelism has been under heated discussion in applied machine learning r screen shot 2016-05-06 at 10 13 16 pm -In the figure above, we assign different lstm model to different GPUs. After GPU1 finish computing layer 1 with first sentence. The output will be given to GPU 2. At the same time, GPU 1 will fetch the next sentence and start training. This is significantly different from data parallelism that there's no contention to update the shared model at the end of each iteration, and most of the communication happens during pipelining intermediate results between GPU's. +In the figure above, we assign different LSTM model to different GPUs. 
After GPU1 finish computing layer 1 with first sentence. The output will be given to GPU 2. At the same time, GPU 1 will fetch the next sentence and start training. This is significantly different from data parallelism that there is no contention to update the shared model at the end of each iteration, and most of the communication happens during pipelining intermediate results between GPU's. In the current implementation, the layers are defined in [lstm_unroll()](https://github.com/dmlc/mxnet/blob/master/example/model-parallel-lstm/lstm.py). @@ -23,13 +23,13 @@ In the current implementation, the layers are defined in [lstm_unroll()](https:/ Implementing Model Parallelism requires good knowledge of training task to partition the network throughout the GPUs. Although it requires detailed analysis that is beyond the scope of a course project, we found that we can lay down some general principles. -- Place neighbor layers in the same GPU to avoid data transmition. +- Place neighbor layers in the same GPU to avoid data transmission. - Balancing the workload between GPUs to avoid bottleneck in a pipeline situation. - Remember, different kind of layers have different computation-memory properties. screen shot 2016-05-07 at 1 51 02 am -Let us have a quick look into the 2 pipeline above. They both have 8 layers with a decoder and an encoder layer. Clearly, based on our first principle, it is unwise to place all neighbor layers in separate GPUs. One other thing is we want to balance the workload accross GPUs. Here LSTM layers, although having less memory comsumptions than decoder/encoder layers, will take up more computation time because dependency of unrolled LSTM. Thus, the partition on the left will be better in speed than the right because of a more even workload in Model Parallelism. +Let us have a quick look into the 2 pipeline above. They both have 8 layers with a decoder and an encoder layer. 
Clearly, based on our first principle, it is unwise to place all neighbor layers in separate GPUs. One other thing is we want to balance the workload across GPUs. Here LSTM layers, although having less memory consumptions than decoder/encoder layers, will take up more computation time because dependency of unrolled LSTM. Thus, the partition on the left will be better in speed than the right because of a more even workload in Model Parallelism. Currently the layer partition is implemented in [lstm.py](https://github.com/eric-haibin-lin/mxnet/blob/master/example/model-parallel-lstm/lstm.py#L187) and configured in [lstm_ptb.py](https://github.com/eric-haibin-lin/mxnet/blob/master/example/model-parallel-lstm/lstm.py#L187) using the `group2ctx` option. diff --git a/docs/how_to/multi_devices.md b/docs/how_to/multi_devices.md index 95ad8f8ba120..c3cdb2b0c9c8 100644 --- a/docs/how_to/multi_devices.md +++ b/docs/how_to/multi_devices.md @@ -1,6 +1,6 @@ # Run MXNet on Multiple CPU/GPUs with Data Parallel -MXNet supports trainig with multiple CPUs and GPUs since the very +MXNet supports training with multiple CPUs and GPUs since the very beginning. Almost any program using MXNet's provided training modules, such as [python/mxnet.model](https://github.com/dmlc/mxnet/blob/master/python/mxnet/model.py), can be efficiently run over multiple devices. @@ -16,12 +16,12 @@ updated model are communicated cross these devices. ### Workload Partitioning -If using data parallelism, MXNet will evenly partition a minbatch in each +If using data parallelism, MXNet will evenly partition a mini batch in each GPUs. Assume we train with batch size *b* and *k* GPUs, then in one iteration each GPU will perform forward and backward on a batch with size *b/k*. The gradients are then summed over all GPUs before updating the model. -In ideal case, *k* GPUs will provide *k* time speedup comparing to the single +In ideal case, *k* GPUs will provide *k* time speed up comparing to the single GPU. 
In addition, assume the model has size *m* and the temporal workspace is *t*, then the memory footprint of each GPU will be *m+t/k*. In other words, we can use a large batch size for multiple GPUs. @@ -30,11 +30,11 @@ can use a large batch size for multiple GPUs. > To use GPUs, we need to compiled MXNet with GPU support. For > example, set `USE_CUDA=1` in `config.mk` before `make`. (see -> [build](../get_started/build.html) for more options). +> [MXNet installation guide](build.html) for more options). If a machine has one or more than one GPU cards installed, then each card is -labeled by a number starting from 0. To use a particular GPU, one can often -either specify the context `ctx` in codes or pass `--gpus` in commandlines. For +labelled by a number starting from 0. To use a particular GPU, one can often +either specify the context `ctx` in codes or pass `--gpus` in command line. For example, to use GPU 0 and 2 in python one can often create a model with ```python import mxnet as mx @@ -76,7 +76,7 @@ kvstore type gradient aggregation weight updating ``` Here -- `local_update_cpu`: gradients are first copied to CPU memory, and aggregated +- `local_update_cpu`: Gradients are first copied to CPU memory, and aggregated on CPU. Then we update the weight on CPU and copy back the updated weight to GPUs. It is suitable when the layer model size is not large, such as convolution layers. @@ -98,7 +98,7 @@ Here The `kvstore` type is `local` in default. It will choose `local_update_cpu` if the weight size of each layer is less than 1Mb, which can be changed by -the environment varialbe `MXNET_KVSTORE_BIGARRAY_BOUND`, and +the environment variable `MXNET_KVSTORE_BIGARRAY_BOUND`, and `local_allreduce_cpu` otherwise. ## Distributed Training with Multiple Machines @@ -115,8 +115,8 @@ and speed when using multiple machines. results for using batch size *n\*b* on a single machine. - `dist_async` remove the aggregation operation in `dist_sync`. 
The weight is - updated once received gradient from any machine. The updating is atomic, - namely no two updatings happen on the same weight at the same time. However, + updated once received gradient from any machine. The update is atomic, + namely no two updates happen on the same weight at the same time. However, the order is not guaranteed. Roughly speaking, `dist_sync` runs slower than `dist_async` due the extra @@ -131,7 +131,7 @@ information about these two data consistency models. ### How to Launch a Job > To use distributed training, we need to compile with `USE_DIST_KVSTORE=1` -> (see [build](../get_started/build.html) for more options). +> (see [MXNet installation guide](build.html) for more options). Launching a distributed job is little bit different than running on a single machine. MXNet provides diff --git a/docs/how_to/new_op.md b/docs/how_to/new_op.md index a49efc156780..720dbd97ddf6 100644 --- a/docs/how_to/new_op.md +++ b/docs/how_to/new_op.md @@ -8,7 +8,7 @@ We try to do our best to provide high speed operators for most common use cases. * ~~(Deprecated) Use native language, mxnet.rtc and mxnet.ndarray. This gives you most of the performance of 3) and most of the convenience of 1), but requires more knowledge of MXNet. You can write CUDA kernels in python and compile with during runtime.~~ -* 1) Use CustomOp to write new operators in frontend language (i.e. Python) that runs on cpu or gpu. Depending on your implementation, this can range from very fast to very slow. +* 1) Use CustomOp to write new operators in front end language (i.e. Python) that runs on CPU or GPU. Depending on your implementation, this can range from very fast to very slow. * 2) Use C++/MShadow(CUDA). This can be difficult if you are not familiar with MXNet, mashadow or Cuda, but it will give you the best performance. 
@@ -28,7 +28,7 @@ class Softmax(mx.operator.CustomOp): y /= y.sum(axis=1).reshape((x.shape[0], 1)) self.assign(out_data[0], req[0], mx.nd.array(y)) ``` -Here we defined the computation for forward pass of our operator. The forward function takes a list of input and a list of output NDArrays. Here we called .asnumpy() on the input NDArray to convert it to cpu based numpy arrays for convenience. +Here we defined the computation for forward pass of our operator. The forward function takes a list of input and a list of output NDArrays. Here we called .asnumpy() on the input NDArray to convert it to CPU based numpy arrays for convenience. Keep in mind that this can be very slow. If you want the best performance, keep data in NDArray format and use operations under mx.nd to do the computation. @@ -49,7 +49,7 @@ First we register our new operator with the name 'softmax': @mx.operator.register("softmax") class SoftmaxProp(mx.operator.CustomOpProp): ``` -Then we call our base constructor with `need_top_grad=False` be cause softmax is a loss layer and we don't need gradient input from layers above: +Then we call our base constructor with `need_top_grad=False` because softmax is a loss layer and we don't need gradient input from layers above: ```python def __init__(self): super(SoftmaxProp, self).__init__(need_top_grad=False) @@ -75,7 +75,7 @@ Next we need to provide `infer_shape` to declare the shape of our output/weight ``` The first dim of an input/output tensor is batch size. Our label is a set of integers, one for each data entry, and our output has the same shape as input. Infer_shape should always return three lists in the order inputs, outputs and auxiliary states (which we don't have here), even if one of them is empty. 
-Finally, we need to define a create_operator function that will be called by the backend to create an instance of Softmax: +Finally, we need to define a create_operator function that will be called by the back-end to create an instance of Softmax: ```python def create_operator(self, ctx, shapes, dtypes): return Softmax() diff --git a/docs/how_to/perf.md b/docs/how_to/perf.md index fcca8a5150a0..38b8b75b555f 100644 --- a/docs/how_to/perf.md +++ b/docs/how_to/perf.md @@ -1,18 +1,18 @@ # Performance -The following factors may significant affect the performance: +The following factors may significantly affect the performance: -1. Use a fast backend. A fast BLAS library, e.g. openblas, altas, +1. Use a fast back-end. A fast BLAS library, e.g. openblas, altas, and mkl, is necessary if only using CPU. While for Nvidia GPUs, we strongly recommend to use CUDNN. 2. Three important things for the input data: - 1. data format. If you are using the `rec` format, then everything should be + 1. Data format. If you are using the `rec` format, then everything should be fine. - 2. decoding. In default MXNet uses 4 CPU threads for decoding the images, which + 2. Decoding. In default MXNet uses 4 CPU threads for decoding the images, which are often able to decode over 1k images per second. You may increase the number of threads if either you are using a low-end CPU or you GPUs are very powerful. - 3. place to store the data. Any local or distributed filesystem (HDFS, Amazon + 3. Place to store the data. Any local or distributed filesystem (HDFS, Amazon S3) should be fine. There may be a problem if multiple machines read the data from the network shared filesystem (NFS) at the same time. 3. Use a large batch size. 
We often choose the largest one which can fit into diff --git a/docs/how_to/pretrained.md b/docs/how_to/pretrained.md index 1c1868cf4cf1..e20f75ee21af 100644 --- a/docs/how_to/pretrained.md +++ b/docs/how_to/pretrained.md @@ -1,6 +1,6 @@ -Pretrained Model Gallary +Pre-trained Model Gallery ======================== -This document contains the the pretrained in MXNet +This document contains the pre-trained models in MXNet * [89.9% Top-5 Validation Accuracy for ImageNet 1,000 Classes Challenge](https://github.com/dmlc/mxnet-model-gallery/blob/master/imagenet-1k-inception-bn.md) * [37.2% Top-1 Training Accuracy for Full ImageNet 21,841 Classes](https://github.com/dmlc/mxnet-model-gallery/blob/master/imagenet-21k-inception.md) diff --git a/docs/how_to/smart_device.md b/docs/how_to/smart_device.md index 2e102ff130d6..579605749fc6 100644 --- a/docs/how_to/smart_device.md +++ b/docs/how_to/smart_device.md @@ -6,7 +6,7 @@ Deep learning (DL) systems are complex and often have a few of dependencies. It The idea of amalgamation comes from SQLite and other projects, which packs all the codes into a single source file. Then it is only needed to compile that single file to create the library, which makes porting to various platforms much easier. MXNet provides an [amalgamation](https://github.com/dmlc/mxnet/tree/master/amalgamation) script, thanks to [Jack Deng](https://github.com/jdeng), to combiles all codes needed for prediction using trained DL models into a single `.cc` file, which has around 30K lines of codes. The only dependency required is just a BLAS library. -We also have a minimal version removed BLAS dependency, and the single file can be compiled into JavaScript by using [enscripten](https://github.com/kripken/emscripten). +We also have a minimal version with the BLAS dependency removed, and the single file can be compiled into JavaScript by using [emscripten](https://github.com/kripken/emscripten). The compiled library can be used by any other programming language easily.
The `.h` file contains a light prediction API, porting to another language with a C foreign function interface needs little effect. For example @@ -17,7 +17,7 @@ The compiled library can be used by any other programming language easily. The ` To do amalgamation, there are a few things we need to be careful about when building the project: -- Minimize the dependency to other libraries and do. +- Minimize the dependency to other libraries. - Use namespace to encapsulate the types and operators. - Avoid do commands such as ```using namespace xyz``` on the global scope. - Avoid cyclic include dependencies. @@ -30,7 +30,7 @@ With amalgamation, deploying the system on smart devices (such as Android or iOS 1. The model should be small enough to fit into the device’s memory 2. The model should not be too expensive to run given the relative low computational power of these devices -Next we will use the image recognition as an example to show how we try to get such a model. We start with the state-of-the-art inception model. We train it on imagnet dataset, using multiple server machines with GTX 980 cards. The resulted model fits into memory, but we find it can be too expensive to run. Then we remove some layers. then further remove somethings, but now the results are too pool. more explains, and the results table. +Next we will use the image recognition as an example to show how we try to get such a model. We start with the state-of-the-art inception model. We train it on imagenet dataset, using multiple server machines with GTX 980 cards. The resulted model fits into memory, but we find it can be too expensive to run. Then we remove some layers, then further remove something, but now the results are too poor. More explanation and a results table will be added. @@ -39,7 +39,7 @@ Finally, we show an Android example, thanks to Leliana, [https://github.com/Leli -By using amalgamation, we can easily port the prediction library to mobile devices, with nearly no dependency.
Compile on smart platform is no longer a painful task. After compiled library for smart platform, the last thing is call C-API in the target language (Jave/Swift). +By using amalgamation, we can easily port the prediction library to mobile devices, with nearly no dependency. Compile on smart platform is no longer a painful task. After compiled library for smart platform, the last thing is call C-API in the target language (Java/Swift). diff --git a/docs/how_to/torch.md b/docs/how_to/torch.md index e272445a45ed..a155634a500a 100644 --- a/docs/how_to/torch.md +++ b/docs/how_to/torch.md @@ -43,7 +43,7 @@ act2 = mx.symbol.TorchModule(data_0=fc2, lua_string='nn.ReLU(false)', num_data=1 fc3 = mx.symbol.TorchModule(data_0=act2, lua_string='nn.Linear(64, 10)', num_data=1, num_params=2, num_outputs=1, name='fc3') mlp = mx.symbol.SoftmaxOutput(data=fc3, name='softmax') ``` -Let's break it down. First `data = mx.symbol.Variable('data')` defines a Variable as placeholder for input. +Let's break it down. First `data = mx.symbol.Variable('data')` defines a Variable as place holder for input. Then it's fed through Torch's nn modules with `fc1 = mx.symbol.TorchModule(data_0=data, lua_string='nn.Linear(784, 128)', num_data=1, num_params=2, num_outputs=1, name='fc1')`. We can also replace the last line with: ```Python diff --git a/docs/index.md b/docs/index.md index b92c45a6cbc5..dc152b8eb64b 100644 --- a/docs/index.md +++ b/docs/index.md @@ -13,4 +13,4 @@ These are used to generate the index used in search. 
- [Tutorials](system/index.md) # Chinese translation of docs -- [Chinse translation of docs](index_zh.md) +- [Chinese translation of docs](index_zh.md) diff --git a/docs/packages/julia/index.md b/docs/packages/julia/index.md index 03559465cee0..56b294f87f54 100644 --- a/docs/packages/julia/index.md +++ b/docs/packages/julia/index.md @@ -1,2 +1,2 @@ # MXNet Julia Package -Julia documents are available at [http://mxnetjl.readthedocs.org/](http://mxnetjl.readthedocs.org/) +Julia documents are available at [http://dmlc.ml/MXNet.jl/latest/](http://dmlc.ml/MXNet.jl/latest/). diff --git a/docs/packages/python/index.md b/docs/packages/python/index.md index aa22ebcd2dce..70b72cb4cdc8 100644 --- a/docs/packages/python/index.md +++ b/docs/packages/python/index.md @@ -2,9 +2,9 @@ MXNet Python Package ==================== This page contains links to all the python related documents on python package. To install the python package, checkout [Build and Installation Instruction](../../how_to/build.md). -There are three types of documents you can find about mxnet. +There are three types of documents you can find about MXNet. -* [Tutorials](#tutorials) are self contained materials that introduces a certain use-cases of mxnet. +* [Tutorials](#tutorials) are self contained materials that introduces a certain use-cases of MXNet. * [Code Examples](../../../example) contains example codes. * [Python API Documents](#python-api-documents) contains documents about specific module, as well as reference of all API functions. 
@@ -14,11 +14,11 @@ Tutorials * [Symbolic Configuration and Execution in Pictures](symbol_in_pictures.md) * [How to Create New Operations (Layers)](../../how_to/new_op.md) -Python API Documents +Python API Reference -------------------- -* [High Level Model Training Related API](model.md) -* [The Module API](module.md) -* [NDArray API](ndarray.md) -* [Symbolic API](symbol.md) -* [KVStore API](kvstore.md) -* [Data Loading API](io.md) +* [Module API](module.md) a flexible high-level interface for training neural networks +* [Model API](model.md) an alternate simple high-level interface for training neural networks +* [Symbolic API](symbol.md) for operations on NDArrays to assemble neural networks from layers +* [IO Data Loading API](io.md) for parsing and loading data +* [NDArray API](ndarray.md) for vector/matrix/tensor operations +* [KVStore API](kvstore.md) for multi-GPU and multi-host distributed training diff --git a/docs/packages/python/io.md b/docs/packages/python/io.md index 00de252a60a3..8d6bcb25c4be 100644 --- a/docs/packages/python/io.md +++ b/docs/packages/python/io.md @@ -11,7 +11,7 @@ Introduction This page will introduce data input method in MXNet. MXNet use iterator to provide data to the neural network. Iterators do some preprocessing and generate batch for the neural network. * We provide basic iterators for MNIST image and RecordIO image. -* To hide the IO cost, prefetch strategy is used to allow parallelism of learning process and data fetching. Data will automatically fetched by an independent thread. +* To hide the IO cost, pre-fetch strategy is used to allow parallelism of learning process and data fetching. Data will automatically fetched by an independent thread. Parameters For Data Iterator ---------------------------- @@ -21,7 +21,7 @@ Generally to create a data iterator, you need to provide five kinds of parameter * **Dataset Param** gives the basic information for the dataset, e.g. file path, input shape. 
* **Batch Param** gives the information to form a batch, e.g. batch size. * **Augmentation Param** tells which augmentation operations(e.g. crop, mirror) should be taken on an input image. -* **Backend Param** controls the behavior of the backend threads to hide data loading cost. +* **Backend Param** controls the behavior of the back-end threads to hide data loading cost. * **Auxiliary Param** provides options to help checking and debugging. Usually, **Dataset Param** and **Batch Param** MUST be given, otherwise data batch can't be create. Other parameters can be given according to algorithm and performance need. Examples and detail explanation of the options will be provided in the later Section. @@ -57,7 +57,7 @@ The following code gives an example of creating a Cifar data iterator. >>> batch_size=100, >>> # Augmentation Parameter >>> # Optional ->>> # when offers mean_img, each image will substract the mean value at each pixel +>>> # when offers mean_img, each image will subtract the mean value at each pixel >>> mean_img="data/cifar/cifar10_mean.bin", >>> # Augmentation Parameter >>> # Optional @@ -81,7 +81,7 @@ The following code gives an example of creating a Cifar data iterator. >>> prefetch_buffer=1) ``` -From the above code, we could find how to create a data iterator. First, you need to explicitly point out what kind of data(MNIST, ImageRecord etc) to be fetched. Then provide the options about the dataset, batching, image augmentation, multi-tread processing and prefetching. Our code will automatically check the validity of the params, if a compulsary param is missing, an error will occur. +From the above code, we could find how to create a data iterator. First, you need to explicitly point out what kind of data(MNIST, ImageRecord etc.) to be fetched. Then provide the options about the dataset, batching, image augmentation, multi-thread processing and prefetching. 
Our code will automatically check the validity of the params, if a compulsory param is missing, an error will occur. How To Get Data --------------- @@ -93,10 +93,10 @@ We provide the [script](../../tests/python/common/get_data.py) to download MNIST RecordIO implements a file format for a sequence of records. We recommend storing images as records and pack them together. The benefits are: * Storing images in compacted format, e.g. JPEG, for records can have different size. Compacted format will greatly reduce the dataset size in disk. -* Packing data together allow continous reading on the disk. +* Packing data together allow continuous reading on the disk. * RecordIO has a simple way of partition, which makes it easier for distributed setting. Example about this will be provided later. -We provide the [im2rec tool](../../tools/im2rec.cc) to create Image RecordIO dataset by yourself. Here's the walkthrough: +We provide the [im2rec tool](../../tools/im2rec.cc) to create Image RecordIO dataset by yourself. Here's the walk through: ### 0.Before you start Make sure you have downloaded the data. You don't need to resize the images by yourself, currently ```im2rec``` could resize it automatically. You could check the promoting message of ```im2rec``` for details. @@ -132,9 +132,9 @@ A sample command: ``` More details can be found by running ```./bin/im2rec```. -### Extension: Mutliple Labels for a Single Image +### Extension: Multiple Labels for a Single Image -The `im2rec` tool and `mx.io.ImageRecordIter` also has a mutli-label support for a single image. +The `im2rec` tool and `mx.io.ImageRecordIter` also has a multi-label support for a single image. Assume you have 4 labels for a single image, you can take the following steps to utilize the RecordIO tools. 1. 
Write the the image list files as follows: diff --git a/docs/packages/python/model.md b/docs/packages/python/model.md index d3b561f58de7..e7ea7e3bc0d5 100644 --- a/docs/packages/python/model.md +++ b/docs/packages/python/model.md @@ -1,6 +1,6 @@ MXNet Python Model API ====================== -The model API in mxnet is not really an API. +The model API is a simplified way to train neural networks using common best practices. It is a thin wrapper build on top of [ndarray](ndarray.md) and [symbolic](symbol.md) modules to make neural network training easy. diff --git a/docs/packages/python/symbol.md b/docs/packages/python/symbol.md index d09ee2a2bf98..5c460c52394b 100644 --- a/docs/packages/python/symbol.md +++ b/docs/packages/python/symbol.md @@ -1,11 +1,13 @@ # MXNet Python Symbolic API -* [How to Commpose Symbols](#overloaded-operators) introduces operator overloading of symbols -* [Symbol Attributes](#symbol-attributes) introduces how to attach attributes to symbols +* [How to Compose Symbols](#overloaded-operators) introduces operator overloading of symbols. +* [Symbol Attributes](#symbol-attributes) introduces how to attach attributes to symbols. * [Serialization](#serialization) introduces how to save and load symbols. -* [Multiple Outputs](#multiple-outputs) introduces how to configure multiple outputs +* [Executing Symbols](#executing-symbols) introduces how to evaluate the symbols with data. +* [Execution API Reference](#execution-api-reference) gives reference to all the execution APIs. +* [Multiple Outputs](#multiple-outputs) introduces how to configure multiple outputs. * [Symbol Creation API Reference](#symbol-creationapi-reference) gives reference to all functions. -* [Symbol Object Document](#mxnet.symbol.Symbol) gives API reference to the Symbol Object -* [Execution API Reference](#execution-api-reference) tell us on what executor can do. +* [Symbol Object Document](#mxnet.symbol.Symbol) gives API reference to the Symbol Object. 
+* [Testing Utility Reference](#testing-utility-reference) gives reference to the testing utilities. You are also highly encouraged to read [Symbolic Configuration and Execution in Pictures](symbol_in_pictures.md) with this document. @@ -28,7 +30,7 @@ The following code gives an example of two layer neural network configuration. ``` -The basic arithematic operators(plus, minus, div, multiplication) are overloaded for +The basic arithmetic operators(plus, minus, div, multiplication) are overloaded for ***elementwise operations*** of symbols. The following code gives an example of computation graph that add two inputs together. @@ -47,12 +49,12 @@ data = mx.sym.Variable('data', attr={'mood': 'angry'}) op = mx.sym.Convolution(data=data, name='conv', kernel=(1, 1), num_filter=1, attr={'mood': 'so so'}) ``` -Both key and values of the attribute dictionary should be strings, in order to properly communicate with the C++ backend. The attributes can be retrived via `attr(key)` or `list_attr()`: +Both key and values of the attribute dictionary should be strings, in order to properly communicate with the C++ backend. The attributes can be retrieved via `attr(key)` or `list_attr()`: ``` assert data.attr('mood') == 'angry' assert op.list_attr() == {'mood': 'so so'} ``` -In the case of a composite symbol, you can also retrieve all the attributes associated with that symbol *and its descendents* via `list_attr(recursive=True)`. Note in the returned dictionary, all the attribute names are with a prefix `'symbol_name' + '_'` in order to avoid naming conflicts. +In the case of a composite symbol, you can also retrieve all the attributes associated with that symbol *and its descendants* via `list_attr(recursive=True)`. Note in the returned dictionary, all the attribute names are with a prefix `'symbol_name' + '_'` in order to avoid naming conflicts. 
```python assert op.list_attr(recursive=True) == {'data_mood': 'angry', 'conv_mood': 'so so', 'conv_weight_mood': 'so so', 'conv_bias_mood': 'so so'} @@ -84,7 +86,7 @@ assert fc2.attr('data') == 'great' assert fc2.attr('init_bias') == '0.0' ``` -**Naming convention**: it is recommended to choose the attribute names to be valid variable names. Names with double underscope (e.g. `__shape__`) are reserved for internal use. The slash `'_'` is the character used to separate a symbol name and its attributes, as well as the separator between a symbol and a variable that is automatically created by that symbol. For example, the `weight` variable created automatically by a ```Convolution``` operator named `conv1` will be called `conv1_weight`. +**Naming convention**: it is recommended to choose the attribute names to be valid variable names. Names with double underscore (e.g. `__shape__`) are reserved for internal use. The underscore `'_'` is the character used to separate a symbol name and its attributes, as well as the separator between a symbol and a variable that is automatically created by that symbol. For example, the `weight` variable created automatically by a ```Convolution``` operator named `conv1` will be called `conv1_weight`. **Components that uses attributes**: more and more components are using symbol attributes to collect useful annotations for the computational graph. Here is a (probably incomplete) list: @@ -97,9 +99,9 @@ Serialization There are two ways to save and load the symbols. You can use pickle to serialize the ```Symbol``` objects. Alternatively, you can use [mxnet.symbol.Symbol.save](#mxnet.symbol.Symbol.save) and [mxnet.symbol.load](#mxnet.symbol.load), functions. The advantage of using save and load is that it is language agnostic, and also being cloud friendly. -The symbol is saved in json format. You can also directly get a json string using [mxnet.symbol.Symbol.tojson](#mxnet.symbol.Symbol.tojson) +The symbol is saved in JSON format. 
You can also directly get a JSON string using [mxnet.symbol.Symbol.tojson](#mxnet.symbol.Symbol.tojson) -The following code gives an example of saving a symbol to S3 bucket, load it back and compare two symbols using json string. +The following code gives an example of saving a symbol to S3 bucket, load it back and compare two symbols using JSON string. ```python >>> import mxnet as mx >>> a = mx.symbol.Variable('a') @@ -111,7 +113,24 @@ The following code gives an example of saving a symbol to S3 bucket, load it bac True ``` -Multiple Ouputs +Executing Symbols +----------------- +Once you have assembled a set of symbols into a computation graph, the MXNet engine can evaluate those symbols. +If you are training a neural network, this is typically +all handled by the high level [Model class](model.md) and the [`fit()`](model.html#mxnet.model.FeedForward.fit) function. + +For neural networks used in "feed-forward", "prediction", or "inference" mode (all different terms for the same +thing: running a trained network), the input arguments will be the +input data, and the weights of the neural network that were learned during training. + +To manually execute a set of symbols, you need to create an [`Executor`](#mxnet.executor.Executor) object, +which is typically constructed by calling the [`simple_bind()`](#mxnet.symbol.Symbol.simple_bind) method on a symbol. +For an example of this, see the sample +[`notebook on how to use simple_bind()`](https://github.com/dmlc/mxnet/blob/master/example/notebooks/simple_bind.ipynb) +To see how to manually + + +Multiple Outputs --------------- You can use [mxnet.symbol.Group](#mxnet.symbol.Group) function to group the symbols together. @@ -160,3 +179,16 @@ Execution API Reference ``` + + +Testing Utility Reference +------------------------- + +```eval_rst +.. automodule:: mxnet.test_utils + :members: + +.. 
raw:: html + + +``` diff --git a/docs/packages/python/symbol_in_pictures.md b/docs/packages/python/symbol_in_pictures.md index 64caae39bfea..ead3f1097735 100644 --- a/docs/packages/python/symbol_in_pictures.md +++ b/docs/packages/python/symbol_in_pictures.md @@ -15,7 +15,7 @@ graph that describes what computation is needed. The following picture shows how Configure Neural Nets --------------------- -Besides fine-grained operations, mxnet also provide a way to perform big operations that is analogous to layers in neural nets. +Besides fine-grained operations, MXNet also provide a way to perform big operations that is analogous to layers in neural nets. We can use these operators to describe a neural net configuration. ![Net Compose](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/symbol/compose_net.png) diff --git a/docs/packages/python/tutorial.md b/docs/packages/python/tutorial.md index 5cecfcd27c0b..22251a8cd4a1 100644 --- a/docs/packages/python/tutorial.md +++ b/docs/packages/python/tutorial.md @@ -199,7 +199,7 @@ machines. This is achieved by lazy evaluation. Any operation we write down is issued to a internal engine, and then returned. For example, if we run `a += 1`, it returns immediately after pushing the plus operation to the engine. This -asynchronicity allows us to push more operations to the engine, so it can determine +asynchronism allows us to push more operations to the engine, so it can determine the read and write dependency and find the best way to execute them in parallel. @@ -268,7 +268,7 @@ We can also specify the automatic generated names explicitly: MXNet provides well-optimized symbols (see [src/operator](https://github.com/dmlc/mxnet/tree/master/src/operator)) for commonly used layers in deep learning. We can also easily define new operators -in python. The following example first performs an elementwise add between two +in python. 
The following example first performs an element-wise add between two symbols, then feeds them to the fully connected operator. ```python @@ -289,7 +289,7 @@ forward composition exemplified above. >>> net2 = mx.symbol.FullyConnected(data=net2, name='net2', num_hidden=128) >>> composed_net = net(data=net2, name='compose') >>> composed_net.list_arguments() -['data2', 'net2_weight', 'net2_bias', 'compose_fc1_weight', 'compose_fc1_bias'] +['data2', 'net2_weight', 'net2_bias', 'fc1_weight', 'fc1_bias'] ``` In the above example, *net* is used as a function to apply to an existing symbol @@ -353,16 +353,16 @@ which provides a detailed explanation of the concepts in pictures. In short, it is designed to be very efficient in both memory and runtime. The major reason for us to introduce the Symbolic API is to bring the efficient C++ -operations in powerful toolkits such as cxxnet and caffe together with the +operations in powerful tool-kits such as CXXNet and caffe together with the flexible dynamic NDArray operations. All the memory and computation resources are allocated statically during Bind, to maximize the runtime performance and memory utilization. -The coarse grained operators are equivalent to cxxnet layers, which are +The coarse grained operators are equivalent to CXXNet layers, which are extremely efficient. We also provide fine grained operators for more flexible -composition. Because we are also doing more inplace memory allocation, mxnet can -be ***more memory efficient*** than cxxnet, and achieves the same runtime, with -greater flexiblity. +composition. Because we are also doing more in place memory allocation, MXNet can +be ***more memory efficient*** than CXXNet, and achieves the same runtime, with +greater flexibility. ## Distributed Key-value Store @@ -491,9 +491,9 @@ This section will be updated when the distributed version is ready. 
- + - + diff --git a/docs/packages/r/CharRnnModel.md b/docs/packages/r/CharRnnModel.md index 201301c7981b..4b8dd42618f5 100644 --- a/docs/packages/r/CharRnnModel.md +++ b/docs/packages/r/CharRnnModel.md @@ -1,7 +1,7 @@ Char RNN Example ============================================= -This example aims to show how to use lstm model to build a char level language model, and generate text from it. We use a tiny shakespeare text for demo purpose. +This example aims to show how to use LSTM model to build a char level language model, and generate text from it. We use a tiny Shakespeare text for demo purpose. Data can be found at [here](https://github.com/dmlc/web-data/tree/master/mxnet/tinyshakespeare) @@ -13,7 +13,7 @@ This tutorial is written in Rmarkdown. Load Data --------- -First of all, load in the data and preprocess it. +First of all, load in the data and pre-process it. ```r require(mxnet) @@ -164,7 +164,7 @@ X.val <- list(data=X.val.data, label=X.val.label) Training Model -------------- -In `mxnet`, we have a function called `mx.lstm` so that users can build a general lstm model. +In `mxnet`, we have a function called `mx.lstm` so that users can build a general LSTM model. ```r diff --git a/docs/packages/r/classifyRealImageWithPretrainedModel.md b/docs/packages/r/classifyRealImageWithPretrainedModel.md index 7bc5fec1a08f..1b6a792d1fa0 100644 --- a/docs/packages/r/classifyRealImageWithPretrainedModel.md +++ b/docs/packages/r/classifyRealImageWithPretrainedModel.md @@ -3,8 +3,8 @@ Classify Real-World Images with Pre-trained Model MXNet is a flexible and efficient deep learning framework. One of the cool thing that a deep learning algorithm can do is to classify real world images. -In this example we will show how to use a pretrained Inception-BatchNorm Network to predict the class of -real world image. The network architecture is decribed in [1]. 
+In this example we will show how to use a pre-trained Inception-BatchNorm Network to predict the class of +real world image. The network architecture is described in [1]. The pre-trained Inception-BatchNorm network is able to be downloaded from [this link](http://data.dmlc.ml/mxnet/data/Inception.zip) This model gives the recent state-of-art prediction accuracy on image net dataset. @@ -15,7 +15,7 @@ This tutorial is written in Rmarkdown. - You can directly view the hosted version of the tutorial from [MXNet R Document](http://mxnet.readthedocs.io/en/latest/packages/r/classifyRealImageWithPretrainedModel.html) - You can find the download the Rmarkdown source from [here](https://github.com/dmlc/mxnet/blob/master/R-package/vignettes/classifyRealImageWithPretrainedModel.Rmd) -Pacakge Loading +Package Loading --------------- To get started, we load the mxnet package by require mxnet. @@ -28,7 +28,7 @@ require(mxnet) ## Loading required package: methods ``` -In this example, we also need the imager package to load and preprocess the images in R. +In this example, we also need the imager package to load and pre-process the images in R. ```r @@ -66,7 +66,7 @@ require(imager) ## save.image ``` -Load the Pretrained Model +Load the Pre-trained Model ------------------------- Make sure you unzip the pre-trained model in current folder. And we can use the model loading function to load the model into R. @@ -83,7 +83,7 @@ We also need to load in the mean image, which is used for preprocessing using `` mean.img = as.array(mx.nd.load("Inception/mean_224.nd")[["mean_img"]]) ``` -Load and Preprocess the Image +Load and Pre-process the Image ----------------------------- Now we are ready to classify a real image. In this example, we simply take the parrots image from imager package. But you can always change it to other images. 
@@ -100,8 +100,8 @@ plot(im) Before feeding the image to the deep net, we need to do some preprocessing to make the image fit the input requirement of deepnet. The preprocessing -include cropping, and substraction of the mean. -Because mxnet is deeply integerated with R, we can do all the processing in R function. +include cropping, and subtraction of the mean. +Because MXNet is deeply integrated with R, we can do all the processing in R function. The preprocessing function: diff --git a/docs/packages/r/index.md b/docs/packages/r/index.md index 829ca1d995e7..a5e59f4a6810 100644 --- a/docs/packages/r/index.md +++ b/docs/packages/r/index.md @@ -11,7 +11,7 @@ Sounds exciting? This page contains links to all the related documents on R pack Get Started ----------- -Checkout the [Installation Guide](../get_started/build.md) contains instructions to install mxnet, and [Tutorials](#tutorials) for examples on how to use mxnet for various tasks. +Checkout the [Installation Guide](../get_started/build.md) contains instructions to install MXNet, and [Tutorials](#tutorials) for examples on how to use MXNet for various tasks. Tutorials --------- @@ -25,7 +25,7 @@ Tutorials Resources --------- There are several information to get you started -* [Installation Guide](../get_started/build.md) contains instructions to install mxnet. +* [Installation Guide](../get_started/build.md) contains instructions to install MXNet. * [Contributor Guide](http://mxnet.readthedocs.org/en/latest/contribute.html#r-package) - - The R package section gives various guidelines on how to contribute code, tutorial, rmarkdown examples to mxnet. + - The R package section gives various guidelines on how to contribute code, tutorial, rmarkdown examples to MXNet. - Your contribution is always welcomed! 
diff --git a/docs/packages/r/mnistCompetition.md b/docs/packages/r/mnistCompetition.md index a84ecb5ec326..a995d964ff4d 100644 --- a/docs/packages/r/mnistCompetition.md +++ b/docs/packages/r/mnistCompetition.md @@ -2,7 +2,7 @@ Handwritten Digits Classification Competition ============================================= [MNIST](http://yann.lecun.com/exdb/mnist/) is a handwritten digits image data set created by Yann LeCun. Every digit is represented by a 28x28 image. It has become a standard data set to test classifiers on simple image input. Neural network is no doubt a strong model for image classification tasks. There's a [long-term hosted competition](https://www.kaggle.com/c/digit-recognizer) on Kaggle using this data set. -We will present the basic usage of [mxnet](https://github.com/dmlc/mxnet/tree/master/R-package) to compete in this challenge. +We will present the basic usage of [MXNet](https://github.com/dmlc/mxnet/tree/master/R-package) to compete in this challenge. This tutorial is written in Rmarkdown. You can download the source [here](https://github.com/dmlc/mxnet/blob/master/R-package/vignettes/mnistCompetition.Rmd) and view a hosted version of tutorial [here](http://mxnet.readthedocs.io/en/latest/packages/r/mnistCompetition.html). @@ -40,7 +40,7 @@ Here every image is represented as a single row in train/test. The greyscale of train.x <- t(train.x/255) test <- t(test/255) ``` -We also transpose the input matrix to npixel x nexamples, which is the column major format accepted by mxnet (and the convention of R). +We also transpose the input matrix to npixel x nexamples, which is the column major format accepted by MXNet (and the convention of R). 
In the label part, we see the number of each digit is fairly even: @@ -240,8 +240,8 @@ device.gpu <- lapply(0:(n.gpu-1), function(i) { }) ``` -As you can see, we can pass a list of devices, to ask mxnet to train on multiple GPUs (you can do similar thing for cpu, -but since internal computation of cpu is already multi-threaded, there is less gain than using GPUs). +As you can see, we can pass a list of devices, to ask MXNet to train on multiple GPUs (you can do similar thing for CPU, +but since internal computation of CPU is already multi-threaded, there is less gain than using GPUs). We start by training on CPU first. Because it takes a bit time to do so, we will only run it for one iteration. diff --git a/docs/packages/r/ndarrayAndSymbolTutorial.md b/docs/packages/r/ndarrayAndSymbolTutorial.md index c04fac237614..2fa91b3fcbbe 100644 --- a/docs/packages/r/ndarrayAndSymbolTutorial.md +++ b/docs/packages/r/ndarrayAndSymbolTutorial.md @@ -324,16 +324,16 @@ which provides a detailed explanation of concepts in pictures. ### How Efficient is Symbolic API -In short, they design to be very efficienct in both memory and runtime. +In short, they design to be very efficient in both memory and runtime. The major reason for us to introduce Symbolic API, is to bring the efficient C++ -operations in powerful toolkits such as cxxnet and caffe together with the +operations in powerful tool-kits such as CXXNet and Caffe together with the flexible dynamic NArray operations. All the memory and computation resources are allocated statically during Bind, to maximize the runtime performance and memory utilization. -The coarse grained operators are equivalent to cxxnet layers, which are +The coarse grained operators are equivalent to CXXNet layers, which are extremely efficient. We also provide fine grained operators for more flexible -composition. 
Because we are also doing more inplace memory allocation, mxnet can -be ***more memory efficient*** than cxxnet, and gets to same runtime, with -greater flexiblity. +composition. Because we are also doing more in-place memory allocation, MXNet can +be ***more memory efficient*** than CXXNet, and gets to same runtime, with +greater flexibility. diff --git a/docs/sphinx_util.py b/docs/sphinx_util.py index 93cdb5605c00..5720f8e543c0 100644 --- a/docs/sphinx_util.py +++ b/docs/sphinx_util.py @@ -5,15 +5,12 @@ import docutils import subprocess -READTHEDOCS_BUILD = (os.environ.get('READTHEDOCS', None) is not None) - def run_build_mxnet(folder): """Run the doxygen make command in the designated folder.""" try: - if READTHEDOCS_BUILD: - subprocess.call('cd %s; cp make/readthedocs.mk config.mk' % folder, shell = True) - subprocess.call('cd %s; rm -rf build' % folder, shell = True) - retcode = subprocess.call("cd %s; make" % folder, shell = True) + subprocess.call('cd %s; cp make/readthedocs.mk config.mk' % folder, shell = True) + subprocess.call('cd %s; rm -rf build' % folder, shell = True) + retcode = subprocess.call("cd %s; make -j$(nproc)" % folder, shell = True) if retcode < 0: sys.stderr.write("build terminated by signal %s" % (-retcode)) except OSError as e: @@ -25,19 +22,9 @@ def run_build_mxnet(folder): else: subprocess.call('cd ../recommonmark/; git pull', shell=True) -if not os.path.exists('web-data'): - subprocess.call('rm -rf web-data;' + - 'git clone https://github.com/dmlc/web-data', shell = True) -else: - subprocess.call('cd web-data; git pull', shell=True) - - curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) root_path = os.path.join(curr_path, '..') run_build_mxnet(root_path) -sys.stderr.write('READTHEDOCS=%s\n' % (READTHEDOCS_BUILD)) - - sys.path.insert(0, os.path.abspath('../recommonmark/')) diff --git a/docs/system/engine.md b/docs/system/engine.md index fa08ffb89ed0..c6dfce616f6a 100644 --- a/docs/system/engine.md +++ 
b/docs/system/engine.md @@ -30,7 +30,7 @@ struct RunContext { void *stream; }; ``` -Alternatively, one could use `mxnet::engine::DAGEngine::Fn` which is the same type defination. +Alternatively, one could use `mxnet::engine::DAGEngine::Fn` which is the same type definition. All the functions will be executed by the internal threads of the engine. In such model, it is usually not suggested to push *blocking* functions to the engine (usually for dealing with I/O tasks like disk, web service, UI, etc.) since it will occupy the execution thread and reduce the total throughput. In such case, we provide another *asynchronous* function type: ```c++ diff --git a/docs/system/index.md b/docs/system/index.md index e7afb51aa0bf..edf7b6788737 100644 --- a/docs/system/index.md +++ b/docs/system/index.md @@ -6,7 +6,7 @@ libraries in general. We believe that open sourcing this system design note can ## Deep Learning Design Notes This section will be updated with self-contained design notes on various aspect of deep learning systems, -in terms of abstraction, optimization and trade-offs. +in terms of abstraction, optimization and trade-off. * [Programming Models for Deep Learning](program_model.md) * [Dependency Engine for Deep Learning](note_engine.md) @@ -20,14 +20,14 @@ The next parts will be specific to MXNet ![System Overview](https://raw.githubusercontent.com/dmlc/dmlc.github.io/master/img/mxnet/system/overview.png) -The above shows major modules of mxnet, and how do they interact with each +The above shows major modules of MXNet, and how do they interact with each other. The modules are - [Runtime Dependency Engine](engine.md): Schedules and executes the operations according to their read/write dependency. - Storage Allocator: Efficiently allocate and recycles memory blocks for GPU and CPU. - Resource Manager: Manage global resources such as random number generator, temporal space. 
-- NDArray: Dynamic asynchronize n-dimensional arrays, provide flexible +- NDArray: Dynamic asynchronous n-dimensional arrays, provide flexible imperative programs for MXNet. - Symbolic Execution: Static symbolic graph executor, provide efficient symbolic graph execution and optimization. diff --git a/docs/system/multi_node.md b/docs/system/multi_node.md index f9aacb93a3bb..9e0e04ea3d57 100644 --- a/docs/system/multi_node.md +++ b/docs/system/multi_node.md @@ -70,13 +70,13 @@ They produce (almost) the same results, but may vary on speed. - `local_update_cpu`, gradients are first copied to main memory, next averaged on CPU, and then update the weight on CPU. It is suitable when the average size of weights are not large and there are a large number of weight. For example the - google Inception network. + Google Inception network. - `local_allreduce_cpu` is similar to `local_update_cpu` except that the averaged gradients are copied back to the devices, and then weights are updated on devices. It is faster than 1 when the weight size is large so we can use the device to accelerate the computation (but we increase the workload - by *k* times). Examples are AlexNet on imagenet. + by *k* times). Examples are AlexNet on ImageNet. - `local_allreduce_device` is similar to `local_allreduce_cpu` except that the gradient are averaged on a chosen device. It may take advantage of the diff --git a/docs/system/note_data_loading.md b/docs/system/note_data_loading.md index 41ae6a83d071..f1fac5140401 100644 --- a/docs/system/note_data_loading.md +++ b/docs/system/note_data_loading.md @@ -2,7 +2,7 @@ Design Efficient Deep Learning Data Loading Module ================================================== Data loading is an important part of the machine learning system, especially when the data is huge and do not fit into memory. The general design goal of data loading module is to achieve more efficient data loading, less effort on data preparation, clean and flexible interface. 
-This tutorial will be organized as follows: in IO Design Insight section, we introduce some insights and guidelines in our data loading design; in Data Format section, we introduce our solution using dmlc-core's binary recordIO implementation; in Data Loading section, we introduce our method to hide IO cost by utilizing the Threadediter provided by dmlc-core; in the Interface Design section, we will show you the simple way to construct a MXNet data iterator in a few lines of python; in the Future Extension part, we discuss how to make data loading more flexible to support more learning tasks. +This tutorial will be organized as follows: in IO Design Insight section, we introduce some insights and guidelines in our data loading design; in Data Format section, we introduce our solution using dmlc-core's binary recordIO implementation; in Data Loading section, we introduce our method to hide IO cost by utilizing the Threaded iter provided by dmlc-core; in the Interface Design section, we will show you the simple way to construct a MXNet data iterator in a few lines of python; in the Future Extension part, we discuss how to make data loading more flexible to support more learning tasks. We will cover the following key requirements, in detail in the later part of sections. @@ -16,7 +16,7 @@ We will cover the following key requirements, in detail in the later part of sec IO design usually involves two kinds of work: data preparation and data loading. Data preparation usually influences the time consuming offline, while data loading influences the online performance. In this section, we will introduce our insight of IO design involving the two phases. ### Data Preparation -Data preparation is to pack the data into certain format for later processing. When the data is huge, i.e. full ImageNet, this process may be time-consuming. Since that, there're several things we need to pay attention: +Data preparation is to pack the data into certain format for later processing. 
When the data is huge, i.e. full ImageNet, this process may be time-consuming. Since that, there are several things we need to pay attention: - Pack the dataset into small numbers of files. A dataset may contain millions of data instances. Packed data distributes easily from machine to machine; - Do the packing once. No repacking is needed when the running setting has been changed (usually means the number of running machines); @@ -24,7 +24,7 @@ Data preparation is to pack the data into certain format for later processing. W - Access to arbitrary parts easily. This is crucial for distributed machine learning when data parallelism is introduced. Things may get tricky when the data has been packed into several physical data files. The desired behavior could be: the packed data can be logically partite into arbitrary numbers of partitions, no matter how many physical data files there are. For example, we pack 1000 images into 4 physical files, each contains 250 images. Then we use 10 machines to training DNN, we should be able to load approximately 100 images per machine. Some machine may need images from different physical files. ### Data Loading -Data loading is to load the packed data into RAM. One ultimate goal is to load as quickly as possible. Thus there're several things we need to pay attention: +Data loading is to load the packed data into RAM. One ultimate goal is to load as quickly as possible. Thus there are several things we need to pay attention: - Continuous reading. This is to avoid arbitrary reading from disk; - Reduce the bytes to be loaded. This can be achieved by storing the data instance in a compact way, e.g. save the image in JPEG format; - Load and train in different threads. 
This is to hide the loading time cost; @@ -63,8 +63,8 @@ The desired behavior of data loading could be: the packed data can be logically Since binary recordIO can easily locate the start and end of a record using the Magic Number, we can achieve the above goal using the InputSplit functionality provided by dmlc-core. InputSplit takes the following parameters: -- FileSystem *filesys: dmlc-core encapsulate the IO operations for different filesystems, like hdfs, s3, local. User don't need to worry about the difference between filesystems any more; -- Char *uri: the uri of files. Note that it could be a list of files, for we may pack the data into several physical parts. File uris are separated by ';'. +- FileSystem *filesys: dmlc-core encapsulate the IO operations for different file systems, like hdfs, s3, local. User don't need to worry about the difference between file systems any more; +- Char *uri: the uri of files. Note that it could be a list of files, for we may pack the data into several physical parts. File URIs are separated by ';'. - Unsigned nsplit: the number of logical splits. Nsplit could be different from the number of physical file parts; - Unsigned rank: which split to load in this process; @@ -84,24 +84,24 @@ The splitting process is demonstrated below: By conducting the above operations, we now identify the records belong to each part, and the physical data files needed by each logical part. InputSplit greatly reduce the difficulty of data parallelism, where each process only read part of the data. Since logical partition doesn't rely on the number of physical data files, we can process huge dataset like ImageNet_22K in parallel easily as illustrated below. We don't need to consider distributed loading issue at the preparation time, just select the most efficient physical file number according to the dataset size and the computing resources you have. 
-![parellelprepare](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/io/parallelprepare.jpg) +![parallelprepare](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/io/parallelprepare.jpg) ## Data Loading and Preprocessing -When the speed of loading and preprocessing can't catch up with the speed of training or evaluation, IO will become the bottleneck of the whole system. In this section, we will introduce our tricks to pursuit the ultimate efficiency to load and preprocess data packed in binary recordIO format. In our ImageNet practice, we can achieve the IO speed of **3000** images/s **with normal HDD**. +When the speed of loading and preprocessing can't catch up with the speed of training or evaluation, IO will become the bottleneck of the whole system. In this section, we will introduce our tricks to pursuit the ultimate efficiency to load and pre-process data packed in binary recordIO format. In our ImageNet practice, we can achieve the IO speed of **3000** images/s **with normal HDD**. ### Loading and preprocessing on the fly -When training deep neural networks, we sometimes can only load and preprocess the data along with training because of the following reasons: +When training deep neural networks, we sometimes can only load and pre-process the data along with training because of the following reasons: - The whole size of the dataset exceed the RAM size, we can't load them in advance; - The preprocessing pipeline may produce different output for the same data at different epoch if we would like to introduce randomness in training; -To achieve the goal of ultimate efficiency, multi-thread technic is introduced in the related procedures. We take imagenet training as an example, after loading a bunch of image records, we can start ***multiple threads to do the image decoding and image augmentation*** , as illustrated below: +To achieve the goal of ultimate efficiency, multi-thread technique is introduced in the related procedures. 
We take Imagenet training as an example, after loading a bunch of image records, we can start ***multiple threads to do the image decoding and image augmentation*** , as illustrated below: ![process](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/io/process.jpg) ### Hide IO Cost Using Threadediter -One way to hide IO cost is to prefetch the data for next batch on a stand-alone thread, while the main thread conducting feed-forward and backward. In order to support more complicated training schemas, MXNet provide a more general IO processing pipeline using threadediter provided by dmlc-core. +One way to hide IO cost is to pre-fetch the data for next batch on a stand-alone thread, while the main thread conducting feed-forward and backward. In order to support more complicated training schemas, MXNet provide a more general IO processing pipeline using threadediter provided by dmlc-core. The key of threadediter is to start a stand-alone thread acts like a data provider, while the main thread acts like data consumer as illustrated below. @@ -121,7 +121,7 @@ dataiter = mx.io.ImageRecordIter( data_shape=(3,28,28), # Batch Parameter, tells how many images in a batch batch_size=100, - # Augmentation Parameter, when offers mean_img, each image will substract the mean value at each pixel + # Augmentation Parameter, when offers mean_img, each image will subtract the mean value at each pixel mean_img="data/cifar/cifar10_mean.bin", # Augmentation Parameter, randomly crop a patch of the data_shape from the original image rand_crop=True, diff --git a/docs/system/note_engine.md b/docs/system/note_engine.md index c777cba2b572..e9b29792da09 100644 --- a/docs/system/note_engine.md +++ b/docs/system/note_engine.md @@ -13,7 +13,7 @@ the runtime dependency scheduling problem in deep learning. 
We will introduce th scheduling problem, how it can help make multi-device deep learning easier and faster, and discuss possible designs of a generic dependency engine that is library and operation independent. -Most design details of this article inspires the dependency engine of mxnet, with the dependency tracking algorithm majorly contributed by [Yutian Li](https://github.com/hotpxl) and [Mingjie Wang](https://github.com/jermainewang). +Most design details of this article inspires the dependency engine of MXNet, with the dependency tracking algorithm majorly contributed by [Yutian Li](https://github.com/hotpxl) and [Mingjie Wang](https://github.com/jermainewang). Dependency Scheduling Problem ----------------------------- @@ -83,7 +83,7 @@ interesting challenges for dependency engine. Consider the following example ![Dep Rand](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/engine/dep_rand.png) Here we are generating random numbers in a sequence. While it seems that the two random number -generations can be parallelized. This is usually not the case. Because usually a pseudorandom +generations can be parallelized. This is usually not the case. Because usually a pseudo random number generator (PRNG) is not thread-safe because it might contain some internal state to mutate when generating a new number. Even if the PRNG is thread-safe, it is still desirable to run the generation in the a serialized way, so we can get reproducible random numbers. @@ -141,7 +141,7 @@ Few important notes: - There is a lag of computation between last backward to layer k to next forward call to layer k. - We can do the weight synchronization of layer k ***in parallel*** with other computation in this lag. -The points mentioned in above list is the exact optimization used by multi GPU deep learning libaries such as cxxnet. +The points mentioned in above list is the exact optimization used by multi GPU deep learning libraries such as CXXNet. 
The idea is to overlap the weight synchronization(communication) with the computation. However, as you may find out it is really not easy to do that, as the copy need to be triggered as soon as backward of that layer completes, which then triggers the reduction, updates etc. @@ -300,7 +300,7 @@ for example, they are usually good for better parallelization, and easier fault However, making things purely immutable makes several things hard: - It is harder to schedule the resource contention problems such as random number and deletion. -- The engine usually need to manage resources (memory, random number) to avoid conflictions. +- The engine usually need to manage resources (memory, random number) to avoid conflicts. - It is harder to plug in user allocated space etc. - No pre-allocated static memory, again because the usual pattern is write to a pre-allocated layer space, which is not supported is data is immutable. diff --git a/docs/system/note_memory.md b/docs/system/note_memory.md index fe1c310d8b7d..64cc4a73f2e3 100644 --- a/docs/system/note_memory.md +++ b/docs/system/note_memory.md @@ -25,7 +25,7 @@ the gradient needed. ![Backward Graph](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/memory/back_graph.png) -Libraries like caffe, cxxnet, torch uses the backprop on same graph. While libraries like Theano, CGT takes the explicit +Libraries like Caffe, CXXNet, torch uses the backprop on same graph. While libraries like Theano, CGT takes the explicit backward path approach. We will adopt the ***explicit backward path*** way in the article, because it brings several advantages in turns of optimization. @@ -80,26 +80,26 @@ This means we need roughly ```2 n``` memory cells. This is the same in the expli the number of nodes in backward pass in roughly the same as forward pass. ### Inplace Operations -One of the very first thing that we can do is inplace memory sharing of operations. 
This is usually done for +One of the very first thing that we can do is in-place memory sharing of operations. This is usually done for simple operations such as activation functions. Consider the following case, where we want to compute the value of three chained sigmoid function. ![Inplace op](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/memory/alloc_inline.png) -Because we can compute sigmoid in the ```inplace``` manner, that is, use the same memory for input and output. +Because we can compute sigmoid in the ```in-place``` manner, that is, use the same memory for input and output. We can simply allocate one copy of memory, and use it compute arbitrary length of sigmoid chain. -However, the inplace optimization sometimes can be done in the wrong way, especially when the package tries +However, the in-place optimization sometimes can be done in the wrong way, especially when the package tries to be a bit general. Consider the following case, where the value of B is not only used by C, but also F. -![Inplace trap](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/memory/alloc_inline_trap.png) +![In-place trap](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/memory/alloc_inline_trap.png) -We cannot perform inplace optimization because the value of B is still needed after ```C=sigmoid(B)``` is computed. -So an algorithm that simply do inplace optimization for every sigmoid operation might fall into such trap, +We cannot perform in-place optimization because the value of B is still needed after ```C=sigmoid(B)``` is computed. +So an algorithm that simply do in-place optimization for every sigmoid operation might fall into such trap, and we need to be careful on when we can do it. ### Normal Memory Sharing -Memories can also be shared besides the inplace operation. Consider the following case, because the +Memories can also be shared besides the in-place operation. 
Consider the following case, because the value of B is no longer needed when we compute E, we can reuse the memory to hold the result of E. ![Normal Sharing](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/memory/alloc_normal.png) @@ -122,7 +122,7 @@ In the above example: Memory Allocation Algorithm --------------------------- We have discussed how the general techniques to optimize memory allocations in previous section. -However, we also see that there are traps which we want to avoid like the inplace case. +However, we also see that there are traps which we want to avoid like the in-place case. How can we allocate the memory correctly? This is not a new problem. For example, it is very similar to register allocation in compilers. So there could be a lot we can borrow. We do not attempt to give a comprehensive review of techniques here, but rather introduce some simple but useful trick to attack @@ -144,7 +144,7 @@ and keep a counter of future operations that depends on the node. ![Alloc](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/memory/alloc_step.png) -- An inplace optimization can be performed when only current operation depend on the source(i.e. counter=1) +- An in-place optimization can be performed when only current operation depend on the source(i.e. counter=1) - A memory can be recycled into the box on the upper right corner when counter goes to 0 - Every time, when we need new memory, we can either get it from the box, or allocate a new one. @@ -217,7 +217,7 @@ Now comes the question on how much we can really save by using these techniques. The answer is we can roughly reduce the memory consumption ***by half*** using these techniques. This is on the coarse grained operation graphs that are already optimized with big operations. More memory reduction could be seen if we are optimizing a fine-grained computation network used by symbolic libraries such as Theano. 
-Most of the ideas in this article inspires the design of mxnet. +Most of the ideas in this article inspires the design of MXNet. We provide an [Memory Cost Estimation Script](https://github.com/dmlc/mxnet/tree/master/example/memcost), which you can play with to see how much memory we need under different strategies. diff --git a/docs/system/operator.md b/docs/system/operator.md index 491f0a2479c3..a5a62101444c 100644 --- a/docs/system/operator.md +++ b/docs/system/operator.md @@ -123,7 +123,7 @@ It is possible that one convolution has several implementations and users want t } ``` -* **Inplace Option:** To further save memory allocation cost, inplace update are welcomed. This usually happens for element-wise operations when input tensor and output tensor are of the same shape. This could be specified by the following interface: +* **Inplace Option:** To further save memory allocation cost, in-place update are welcomed. This usually happens for element-wise operations when input tensor and output tensor are of the same shape. This could be specified by the following interface: ```c++ virtual std::vector> ElewiseOpProperty::ForwardInplaceOption( const std::vector &in_data, @@ -184,7 +184,7 @@ class ConvolutionOpProperty : public OperatorProperty { }; ``` -### Parameterize Operator +### Parametrize Operator When implementing convolution operator, we need to know the kernal size, the stride size, padding size and so on. These parameters should be passed to the operator before any `Forward` or `Backward` interface is called. To do so, user could define a `ConvolutionParam` structure: ```c++ #include diff --git a/docs/system/operator_util.md b/docs/system/operator_util.md index 812967571d32..72eb18f6875c 100644 --- a/docs/system/operator_util.md +++ b/docs/system/operator_util.md @@ -25,7 +25,7 @@ of l1 loss and l2 loss. 
The loss itself can be written as: loss = outside_weight .* f(inside_weight .* (data - label)) grad = outside_weight .* inside_weight .* f'(inside_weight .* (data - label)) ``` -where `.*` stands for elementwise multiplication and `f`, `f'` is the smooth l1 loss function, +where `.*` stands for element wise multiplication and `f`, `f'` is the smooth l1 loss function, which we suppose we have in `mshadow` for now. At first glance, it is impossible to implement this particular loss as an unary or binary operator. But we have automatic differentiation in the symbolic execution. That would simplify the loss to `f` and `f'` directly. In this way, this @@ -226,7 +226,7 @@ MXNET_REGISTER_SIMPLE_OP(smooth_l1, XPU) ``` Remember from shape functions that a default behavior without `set_shape_function` will be forcing the inputs (if binary) to be of the same shape and yield the same shape for output. The `set_enable_scalar` will be -discussed in addtional information. +discussed in additional information. ### All in a List * Create a shape function for determining the output shape @@ -259,7 +259,7 @@ functions and gradients, additional arguments are contained in `env.kwarg`, whic simplify parsing keyword arguments. Refer to the [guide on parameter structure](https://github.com/dmlc/dmlc-core/blob/master/doc/parameter.md) for more details. -Addtional resources like `mshadow::Random` and temporary memory space can also be requested and +Additional resources like `mshadow::Random` and temporary memory space can also be requested and accessed from `EnvArguments.resource`. The registration routine is `set_resource_request(ResourceRequest req)` or `set_resource_request(const std::vector)`, where `mxnet::ResourceRequest` is defined as in: ```cpp @@ -290,7 +290,7 @@ can implement them as a `mxnet::op::mshadow_op`. `src/operator/mshadow_op.h` con as a good example. `mshadow_op` are expression mappers and deal with the scalar case of desired functions. 
Refer to [mshadow expression API guide](https://github.com/dmlc/mshadow/tree/master/doc) for details. -It could also be possible that the operation cannot be done in an elementwise way, like the softmax loss and gradient. +It could also be possible that the operation cannot be done in an element wise way, like the softmax loss and gradient. Then there is a need to create a new tensor operation. Then we need to create a `mshadow` function and a `mshadow::cuda` function directly. Please refer to `mshadow` library for details or `src/operator/roi_pooling.cc` for an example. @@ -314,5 +314,5 @@ struct smooth_l1_loss { The gradient is similar, which can be found in `src/operator/smooth_l1_unary-inl.h`. ### Beyond Two Operands -This new unified API is designed to fulfill the fundamentals of an operation. For operators with more than two inputs, +This new unified API is designed to fulfil the fundamentals of an operation. For operators with more than two inputs, more than one outputs, or in need of more features, please refer to the original [Operator API](operator.md). diff --git a/docs/system/program_model.md b/docs/system/program_model.md index 4ec4e5cd73c6..08f82edb4392 100644 --- a/docs/system/program_model.md +++ b/docs/system/program_model.md @@ -103,12 +103,12 @@ Imperative programs, on the other hand, need to ***be prepared for all possible there is a possibility that any of these variables could be used in the future, this prevents the system to share the memory space of these variables. Of course this argument is a bit idealized, since garbage collection can happen in imperative programs when things runs out of scope, and memory could be re-used. -However, the constraint to be "prepared for all possible futures" indeed happens, and limits the optimizations we can do. This holds for non-trival cases such +However, the constraint to be "prepared for all possible futures" indeed happens, and limits the optimizations we can do. 
This holds for non-trivial cases such as gradient calculation, which we will be discussing in next section. Another optimization that symbolic programs can do is operation folding. In the above programs, the multiplication and addition can be folded into one operation. Which is represented in the following graph. This means one GPU kernel will be executed(instead of two) if the computation runs on GPU. -This is actually what we will do to hand crafted operations in optimized libraries such as cxxnet, caffe. Doing so will improve the computation efficiency. +This is actually what we will do to hand crafted operations in optimized libraries such as CXXNet, Caffe. Doing so will improve the computation efficiency. ![Comp Graph Folded](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/prog_model/comp_graph_fold.png) @@ -118,7 +118,7 @@ boundary on which value is needed and which is not. While imperative programs on ### Case Study on Backprop and AutoDiff -In this section, we will compare the two programing models on the problem of auto differentiation, or backpropagation. Gradient calculation is actually +In this section, we will compare the two programming models on the problem of auto differentiation, or backpropagation. Gradient calculation is actually the problem that all the deep learning library need to solve. It is possible to do gradient calculation in both imperative and symbolic style. Let us start with the imperative programs. The following snippet is a minimum python code that does automatic differentiation on the toy example we discussed. @@ -182,7 +182,7 @@ They corresponds to the red nodes in the following figure. What the imperative program did was actually the same as the symbolic way. It implicitly saves a backward computation graph in the grad closure. When we invoked the ```d.grad```, we start from ```d(D)```, -backtrace the graph to compute the gradient and collect the results back. 
+backtrack the graph to compute the gradient and collect the results back. So we can find that in fact the gradient calculation in both symbolic and imperative programming follows the same pattern. What is the difference between the two then? Again recall the "have to prepared for all possible futures" @@ -222,10 +222,10 @@ with context.NoGradient(): However, the above example still have many possible futures, which means we cannot do the inplace calculation to re-use the memory in forward pass(a trick commonly used to reduce GPU memory usage). The techniques introduced in this section generates explicit backward pass. -On some of the toolkits such as caffe, cxxnet. Backprop is done implicitly on the same graph. +On some of the tool-kits such as Caffe, CXXNet. Backprop is done implicitly on the same graph. The discussions of this section also applies to these cases as well. -Most configuration file based libraries such as cxxnet, caffe are designed for one or two generic requirement. +Most configuration file based libraries such as CXXNet, Caffe are designed for one or two generic requirement. Get the activation of each layer, or get gradient of all the weights. Same problem stays for these libraries, the more generic operations the library have to support, the less optimization(memory sharing) we can do, based on the same data structure. @@ -257,7 +257,7 @@ configuration layer on top of the imperative language. ### Parameter Update -Most symbolic programs are data flow(computation) graphs. Dataflow graph can be used to descrie computation conveniently. +Most symbolic programs are data flow(computation) graphs. Dataflow graph can be used to describe computation conveniently. However, it is not obvious how to use data flow graph to describe parameter updates, because parameter updates introduces mutation, which is not concept of data flow. What most symbolic programs do is to introduce a special update statement, to update some persistent states of the programs. 
@@ -279,8 +279,8 @@ Big vs Small Operations Now we have pass through the battlefield between symbolic and imperative programs. Let us start to talk about the operations supported by deep learning libraries. Usually there are two types of operations supported by different deep learning libraries. - The big layer operations such as FullyConnected, BatchNormalize -- The small operations such as elementwise addition, multiplications. -The libraries like cxxnet, caffe support layer level operations. While the libraries like Theano, Minerva support fine grained operations. +- The small operations such as element wise addition, multiplications. +The libraries like CXXNet, Caffe support layer level operations. While the libraries like Theano, Minerva support fine grained operations. ### Smaller Operations can be More Flexible This is quite natural, in a sense that we can always use smaller operations to compose bigger operations. @@ -289,7 +289,7 @@ For example, the sigmoid unit can be simply be composed by division and exponent sigmoid(x) = 1.0 / (1.0 + exp(-x)) ``` If we have the smaller operations as building blocks, we can express most of the problems we want. -For readers who are more familar with cxxnet, caffe style layers. These operations is not different from a layer, except that they are smaller. +For readers who are more familiar with CXXNet, Caffe style layers. These operations is not different from a layer, except that they are smaller. ```python SigmoidLayer(x) = EWiseDivisionLayer(1.0, AddScalarLayer(ExpLayer(-x), 1.0)) ``` @@ -303,8 +303,8 @@ SigmoidLayer(x) = EWiseDivisionLayer(1.0, AddScalarLayer(ExpLayer(-x), 1.0)) ``` This will create overhead in terms of computation and memory (which could be optimized, with cost). -So the libraries like cxxnet, caffe take a different approach. To support more coarse grained operations -such as BatchNormalization, and the SigmoidLayer directly. 
In each of these layers, the calculation kernel is handcrafted +So the libraries like CXXNet, Caffe take a different approach. To support more coarse grained operations +such as BatchNormalization, and the SigmoidLayer directly. In each of these layers, the calculation kernel is hand crafted with one or only some CUDA kernel launches. This brings more efficiency to these implementations. ### Compilation and Optimization @@ -312,23 +312,23 @@ with one or only some CUDA kernel launches. This brings more efficiency to these Can the small operations be optimized? Of course they can. This comes to the system optimization part of the compilation engine. There are two types of optimization that can be done on the computation graph - The memory allocation optimization, to reuse memory of the intermediate computations. -- Operator fusion, to detect subgraph pattern such as the sigmoid and fuse them into a bigger operation kernel. +- Operator fusion, to detect sub-graph pattern such as the sigmoid and fuse them into a bigger operation kernel. The memory allocation optimization was actually not restricted to small operations graphs, but can also be applied to bigger operations graph as well. -However these optimization may not be essential for bigger operation libraries like cxxnet, caffe. As you never find the compilation step in them. Actually there is a (dumb) ```compilation step``` in these libraries, that basically translate the layers into a fixed forward, backprop execution plan, by running each operation one by one. +However these optimization may not be essential for bigger operation libraries like CXXNet, Caffe. As you never find the compilation step in them. Actually there is a (dumb) ```compilation step``` in these libraries, that basically translate the layers into a fixed forward, backprop execution plan, by running each operation one by one. -For computation graphs with smaller operations, these optimizations are crucial for performance. 
Because the operations are small, there are many subgraph patterns +For computation graphs with smaller operations, these optimizations are crucial for performance. Because the operations are small, there are many sub-graph patterns that can be matched. Also because the final generated operations may not be able to enumerated, an explicit recompilation of the kernels is required, as opposed to the fixed amount of pre-compiled kernels in the big operation libraries. This is the cause of compilation overhead of the symbolic libraries that support small operations. The requirement of compilation optimization also creates overhead of engineering for the libraries that solely support smaller operations. Like in the symbolic vs imperative case. The bigger operation libraries "cheat" by asking user to provide restrictions(to the common layer provided), -so user is actually the one that does the subgraph matching. This removes the compilation overhead to the real brain, which is usually not too bad. +so user is actually the one that does the sub-graph matching. This removes the compilation overhead to the real brain, which is usually not too bad. ### Expression Template and Statically Typed Language As we can see we always have a need to write small operations and compose them together. -Libraries like caffe use hand-carfted kernels to build up these bigger blocks. Otheriwse user have to compose up smaller operations from python side. +Libraries like Caffe use hand-crafted kernels to build up these bigger blocks. Otherwise user have to compose up smaller operations from python side. Actually, there is a third choice, that works pretty well. This is called expression template. Basically, the idea is to use template programming to generate generic kernels from expression tree at compile time. 
You can refer to the [Expression Template Tutorial](https://github.com/dmlc/mshadow/blob/master/guide/exp-template/README.md) @@ -399,11 +399,11 @@ of operation fusion and directly running them. ### Choose your Own Flavours -As we have compare the flavours of deep learning programs. The goal of this article is to list these choices and compare their trade-offs. +As we have compared the flavours of deep learning programs, the goal of this article is to list these choices and compare their trade-offs. There may not be a universal solution for all. But you can always choose your flavour, or combine the flavours you like to create more interesting and intelligent deep learning libraries. Contribution to this Note ------------------------- This note is part of our effort to [open-source system design notes](index.md) -for deep learning libraries. You are more welcomed to contribute to this Note, by submitting a pull request. +for deep learning libraries. You are more than welcome to contribute to this Note, by submitting a pull request. 
diff --git a/docs/tutorials/imagenet_full.md b/docs/tutorials/imagenet_full.md index f0e722ed975d..6b460e00a5f9 100644 --- a/docs/tutorials/imagenet_full.md +++ b/docs/tutorials/imagenet_full.md @@ -66,9 +66,9 @@ We should note that this result is by no means optimal, as we did not carefully ## The Code and Model -The code and step guide is publically available at [https://github.com/dmlc/mxnet/tree/master/example/image-classification](https://github.com/dmlc/mxnet/tree/master/example/image-classification) +The code and step guide is publicly available at [https://github.com/dmlc/mxnet/tree/master/example/image-classification](https://github.com/dmlc/mxnet/tree/master/example/image-classification) -We also release a pretrained model under [https://github.com/dmlc/mxnet-model-gallery/tree/master/imagenet-21k-inception](https://github.com/dmlc/mxnet-model-gallery/tree/master/imagenet-21k-inception.md) +We also release a pre-trained model under [https://github.com/dmlc/mxnet-model-gallery/tree/master/imagenet-21k-inception](https://github.com/dmlc/mxnet-model-gallery/tree/master/imagenet-21k-inception.md) ## How to Use The Model We should point out it 21k classes is much more challenging than 1k. Directly use the raw prediction is not a reasonable way. diff --git a/docs/zh/mxnet-dep-engine-implemention.md b/docs/zh/mxnet-dep-engine-implemention.md index 318bf3cdf7b9..fbd88b593ef5 100644 --- a/docs/zh/mxnet-dep-engine-implemention.md +++ b/docs/zh/mxnet-dep-engine-implemention.md @@ -257,8 +257,8 @@ inline bool ThreadedVar::CompleteWriteDependency(Dispatcher dispatcher) { } } // This is outside of lock scope - // Be very carful, pending_write_ and num_pending_reads_ - // can change now, do not reply ont the two variables. + // Be very careful, pending_write_ and num_pending_reads_ + // can change now, do not reply on the two variables. // The linked list \in [old_pending_write, end_of_read_chain) // is already detached from this Var. 
// So it is safe to modify these @@ -400,7 +400,7 @@ inline void ThreadedEngine::OnComplete(ThreadedOpr* threaded_opr) { finished_cv_.notify_all(); } - // delte operator if it is temperory + // delete operator if it is temporary if (threaded_opr->temporary) { ThreadedOpr::Delete(threaded_opr); } diff --git a/docs/zh/overview.md b/docs/zh/overview.md index a4632d8b2421..6818f0422f2f 100644 --- a/docs/zh/overview.md +++ b/docs/zh/overview.md @@ -1,11 +1,11 @@ The following is an overview of MXNet in Chinese. For english readers, please -refer to our [NIPS learnsys paper](http://learningsys.org/papers/LearningSys_2015_paper_1.pdf) +refer to our [NIPS LearningSys paper](http://learningsys.org/papers/LearningSys_2015_paper_1.pdf) # MXNet设计和实现简介 神经网络本质上是一种语言,我们通过它来表达对应用问题的理解。例如我们用卷积层来表达空间相关性,RNN来表达时间连续性。根据问题的复杂性和信息如何从输入到输出一步步提取,我们将不同大小的层按一定原则连接起来。近年来随着数据的激增和计算能力的大幅提升,神经网络也变得越来越深和大。例如最近几次imagnet竞赛的冠军都使用有数十至百层的网络。对于这一类神经网络我们通常称之为深度学习。从应用的角度而言,对深度学习最重要的是如何方便地表述神经网络,以及如何快速训练得到模型。 -对于一个优秀的深度学习系统,或者更广来说优秀的科学计算系统,最重要的是编程接口的设计。他们都采用将一个*领域特定语言(domain specific language)*嵌入到一个主语言中。例如numpy将矩阵运算嵌入到python中。这类嵌入一般分为两种,其中一种嵌入的较浅,其中每个语句都按原来的意思执行,且通常采用*命令式编程(imperative programming)*,其中numpy和Torch就是属于这种。而另一种则用一种深的嵌入方式,提供一整套针对具体应用的迷你语言。这一种通常使用*声明式语言(declarative programing)*,既用户只需要声明要做什么,而具体执行则由系统完成。这类系统包括Caffe,theano和刚公布的TensorFlow。 +对于一个优秀的深度学习系统,或者更广来说优秀的科学计算系统,最重要的是编程接口的设计。他们都采用将一个*领域特定语言(domain specific language)*嵌入到一个主语言中。例如numpy将矩阵运算嵌入到python中。这类嵌入一般分为两种,其中一种嵌入的较浅,其中每个语句都按原来的意思执行,且通常采用*命令式编程(imperative programming)*,其中numpy和Torch就是属于这种。而另一种则用一种深的嵌入方式,提供一整套针对具体应用的迷你语言。这一种通常使用*声明式语言(declarative programming)*,既用户只需要声明要做什么,而具体执行则由系统完成。这类系统包括Caffe,theano和刚公布的TensorFlow。 这两种方式各有利弊,总结如下 diff --git a/docs/zh/packages/python/io.md b/docs/zh/packages/python/io.md index 08165d84ed0d..ebc6df65b73f 100644 --- a/docs/zh/packages/python/io.md +++ b/docs/zh/packages/python/io.md @@ -58,7 +58,7 @@ Create A Data Iterator >>> batch_size=100, >>> # Augmentation Parameter >>> # 可选 ->>> # when offers 
mean_img, each image will substract the mean value at each pixel +>>> # when offers mean_img, each image will subtract the mean value at each pixel >>> mean_img="data/cifar/cifar10_mean.bin", >>> # Augmentation Parameter >>> # 可选 diff --git a/example/README.md b/example/README.md index bd5f431df04f..e66a86ea0b4e 100644 --- a/example/README.md +++ b/example/README.md @@ -27,8 +27,8 @@ If you want to contribute to this list and the examples, please open a new pull ###Languages Binding Examples ------------------ * [C++ examples](https://github.com/dmlc/mxnet/tree/master/example/cpp) - Example code for using C++ interface, including NDArray, symbolic layer and models. -* [MXNet Python](http://mxnet.readthedocs.org/en/latest/python/index.html) - Python library -* [MXNetR](http://mxnet.readthedocs.org/en/latest/R-package/index.html) - R library +* [MXNet Python](http://mxnet.readthedocs.io/en/latest/packages/python/index.html) - Python library +* [MXNetR](http://mxnet.readthedocs.io/en/latest/packages/r/index.html) - R library * [MXNet.jl](http://mxnetjl.readthedocs.org/en/latest/) - Julia library * [gomxnet](https://github.com/jdeng/gomxnet) - Go binding * [MXNet JNI](https://github.com/dmlc/mxnet/tree/master/amalgamation/jni) - JNI(Android) library @@ -52,6 +52,17 @@ If you want to contribute to this list and the examples, please open a new pull * "Solving classificiation + regression with MXnet in Multi Input + Multi Obj" by [xlvector](https://github.com/xlvector) [github link](https://gist.github.com/xlvector/c304d74f9dd6a3b68a3387985482baac) [Blog in Chinese](http://blog.xlvector.net/2016-05/mxnet-regression-classification-for-concret-continuous-features/) * "Learn to sort by LSTM" by [xlvector](https://github.com/xlvector) [github link](https://github.com/xlvector/learning-dl/tree/master/mxnet/lstm_sort) [Blog in Chinese](http://blog.xlvector.net/2016-05/mxnet-lstm-example/) * [Neural Art using extremely lightweight (<500K) neural 
network](https://github.com/pavelgonchar/neural-art-mini) Lightweight version of mxnet neural art implementation by [Pavel Gonchar](https://github.com/pavelgonchar) +* [Neural Art with generative networks](https://github.com/zhaw/neural_style) by [zhaw](https://github.com/zhaw) +* [Faster R-CNN in MXNet with distributed implementation and data parallelization](https://github.com/dmlc/mxnet/tree/master/example/rcnn) +* [Asynchronous Methods for Deep Reinforcement Learning in MXNet](https://github.com/zmonoid/Asyn-RL-MXNet/blob/master/mx_asyn.py) by [zmonoid](https://github.com/zmonoid) +* [Deep Q-learning in MXNet](https://github.com/zmonoid/DQN-MXNet) by [zmonoid](https://github.com/zmonoid) +* [Face Detection with End-to-End Integration of a ConvNet and a 3D Model (ECCV16)](https://github.com/tfwu/FaceDetection-ConvNet-3D) by [tfwu](https://github.com/tfwu), source code for paper Yunzhu Li, Benyuan Sun, Tianfu Wu and Yizhou Wang, "Face Detection with End-to-End Integration of a ConvNet and a 3D Model", ECCV 2016 +* [End-to-End Chinese plate recognition base on MXNet](https://github.com/szad670401/end-to-end-for-chinese-plate-recognition) by [szad670401](https://github.com/szad670401) +* [Reproduce ResNet-v2 (Identity Mappings in Deep Residual Networks) using MXNet](https://github.com/tornadomeet/ResNet) by [tornadomeet](https://github.com/tornadomeet) +* [Learning similarity among images in MXNet](http://www.jianshu.com/p/70a66c8f73d3) by xlvector in Chinese. Github [link](https://github.com/xlvector/learning-dl/tree/master/mxnet/triple-loss) +* [Matrix decomposition (SVD) with MXNet](http://www.jianshu.com/p/ebf7bf53ed3e) by xlvector in Chinese. 
Github [link](https://github.com/xlvector/mxnet/blob/svd/example/svd/svd.py) +* [MultiGPU enabled image generative models (GAN and DCGAN)](https://github.com/tqchen/mxnet-gan) by [Tianqi Chen](https://github.com/tqchen) +* [Baidu Warp CTC with Mxnet](https://github.com/dmlc/mxnet/tree/master/example/warpctc) by xlvector ###IPython Notebooks ----------------- @@ -81,14 +92,15 @@ If you want to contribute to this list and the examples, please open a new pull * [Video backup in Mainland China](http://pan.baidu.com/s/1eS58Gue) * [iPython Notebook](https://github.com/dmlc/mxnet-gtc-tutorial) -###Deep learning for hackers with MXnet +###Deep learning for hackers with MXNet -* Deep learning for hackers with MXnet (1) GPU installation and MNIST [English](https://no2147483647.wordpress.com/2015/12/07/deep-learning-for-hackers-with-mxnet-1/) [Chinese](http://phunter.farbox.com/post/mxnet-tutorial1) - a tutorial of installing MXnet with GPU and introduction to deep learning by MNIST example. -* Deep learning for hackers with MXnet (2): Neural art [English](https://no2147483647.wordpress.com/2015/12/21/deep-learning-for-hackers-with-mxnet-2/) [Chinese](http://phunter.farbox.com/post/mxnet-tutorial2) - a tutorial of generating Van Gogh style cat paintings. +* Deep learning for hackers with MXNet (1) GPU installation and MNIST [English](https://no2147483647.wordpress.com/2015/12/07/deep-learning-for-hackers-with-mxnet-1/) [Chinese](http://phunter.farbox.com/post/mxnet-tutorial1) - a tutorial of installing MXnet with GPU and introduction to deep learning by MNIST example. +* Deep learning for hackers with MXNet (2): Neural art [English](https://no2147483647.wordpress.com/2015/12/21/deep-learning-for-hackers-with-mxnet-2/) [Chinese](http://phunter.farbox.com/post/mxnet-tutorial2) - a tutorial of generating Van Gogh style cat paintings. 
-### MXnet setup on AWS +### MXNet on the cloud * [Setup Amazon AWS GPU instance with MXnet](https://no2147483647.wordpress.com/2016/01/16/setup-amazon-aws-gpu-instance-with-mxnet/) - AWS GPU instance setup with GPU (CUDA with latest cuDNN and S3 support) -* [Intro Guide to AWS (MXnet with Julia)](http://www.datasciencebowl.com/aws_guide/) - A step-by-step guide of using spot instances with Amazon Web Services (AWS) to help you save money when training DSB models on Mxnet by [Mike Kim](http://www.datasciencebowl.com/author/mikekim/) +* [Intro Guide to AWS (MXNet with Julia)](http://www.datasciencebowl.com/aws_guide/) - A step-by-step guide of using spot instances with Amazon Web Services (AWS) to help you save money when training DSB models on MXNet by [Mike Kim](http://www.datasciencebowl.com/author/mikekim/) +* [Building Deep Neural Networks in the Cloud with Azure GPU VMs, MXNet and Microsoft R Server](https://blogs.technet.microsoft.com/machinelearning/2016/09/15/building-deep-neural-networks-in-the-cloud-with-azure-gpu-vms-mxnet-and-microsoft-r-server/) by [Cortana Intelligence and ML Blog Team](https://social.technet.microsoft.com/profile/Cortana+Intelligence+and+ML+Blog+Team) at Microsoft ### Kaggle tutorials * [Kaggle 2nd Annual Data Science Bowl End-to-End Deep Learning Tutorial (Python)](https://www.kaggle.com/c/second-annual-data-science-bowl/forums/t/18079/end-to-end-deep-learning-tutorial-0-0392) - an end-to-end python tutorial for Kaggle heart disease diagnose competition (public leaderboard score 0.0392) diff --git a/example/caffe/README.md b/example/caffe/README.md new file mode 100644 index 000000000000..2a28e012a53a --- /dev/null +++ b/example/caffe/README.md @@ -0,0 +1,49 @@ +# How to use Caffe operator in MXNet + +[Caffe](http://caffe.berkeleyvision.org/) has been a well-known and widely-used deep learning framework. Now MXNet has supported calling most caffe operators(layers) and loss functions directly in its symbolic graph! 
Using one's own customized caffe layer is also effortless. + +Besides Caffe, MXNet has already embedded Torch modules and its tensor mathematical functions. ([link](https://github.com/dmlc/mxnet/blob/master/docs/how_to/torch.md)) + +This blog demonstrates two steps to use Caffe op in MXNet: + +* How to install MXNet with Caffe support. + +* How to embed Caffe op into MXNet's symbolic graph. + +## Install Caffe With MXNet interface +* Download official Caffe repository [BVLC/Caffe](https://github.com/BVLC/caffe). +* Download [caffe patch for mxnet interface](https://github.com/BVLC/caffe/pull/4527.patch). Move patch file under your caffe root folder and apply the patch by `git apply patch_file_name`. +* Install caffe following [official guide](http://caffe.berkeleyvision.org/installation.html). + +## Compile with Caffe +* In mxnet folder, open `config.mk` (if you haven't already, copy `make/config.mk` (Linux) or `make/osx.mk` (Mac) into MXNet root folder as `config.mk`) and uncomment the lines `CAFFE_PATH = $(HOME)/caffe` and `MXNET_PLUGINS += plugin/caffe/caffe.mk`. Modify `CAFFE_PATH` to your caffe installation if necessary. +* Run `make clean && make` to build with caffe support. + +## Caffe Operator (Layer) +Caffe's neural network operator and loss functions are supported by MXNet through `mxnet.symbol.CaffeOp` and `mxnet.symbol.CaffeLoss` respectively. 
+For example, the following code shows multi-layer perceptron network for classifying MNIST digits ([full code](https://github.com/dmlc/mxnet/blob/master/example/caffe/caffe_net.py)): + +### Python +```Python +data = mx.symbol.Variable('data') +fc1 = mx.symbol.CaffeOp(data_0=data, num_weight=2, name='fc1', prototxt="layer{type:\"InnerProduct\" inner_product_param{num_output: 128} }") +act1 = mx.symbol.CaffeOp(data_0=fc1, prototxt="layer{type:\"TanH\"}") +fc2 = mx.symbol.CaffeOp(data_0=act1, num_weight=2, name='fc2', prototxt="layer{type:\"InnerProduct\" inner_product_param{num_output: 64} }") +act2 = mx.symbol.CaffeOp(data_0=fc2, prototxt="layer{type:\"TanH\"}") +fc3 = mx.symbol.CaffeOp(data_0=act2, num_weight=2, name='fc3', prototxt="layer{type:\"InnerProduct\" inner_product_param{num_output: 10}}") +mlp = mx.symbol.SoftmaxOutput(data=fc3, name='softmax') +``` + +Let's break it down. First `data = mx.symbol.Variable('data')` defines a variable as placeholder for input. +Then it's fed through Caffe operators with `fc1 = mx.symbol.CaffeOp(data_0=data, num_weight=2, name='fc1', prototxt="layer{type:\"InnerProduct\" inner_product_param{num_output: 128} }")`. + +The inputs to caffe op are named as data_i for i=0 ... num_data-1 as `num_data` is the number of inputs. You may skip the argument, as the example does, if its value is 1. While `num_weight` is number of `blobs_`(weights). Its default value is 0, as many ops maintain no weight. `prototxt` is the configuration string. + +We could also replace the last line by: + +```Python +label = mx.symbol.Variable('softmax_label') +mlp = mx.symbol.CaffeLoss(data=fc3, label=label, grad_scale=1, name='softmax', prototxt="layer{type:\"SoftmaxWithLoss\"}") +``` + +to use loss function in caffe. 
diff --git a/example/caffe/caffe_net.py b/example/caffe/caffe_net.py new file mode 100644 index 000000000000..c91d37bcbecb --- /dev/null +++ b/example/caffe/caffe_net.py @@ -0,0 +1,112 @@ +import mxnet as mx +from data import get_iterator +import argparse +import train_model + +def get_mlp(): + """ + multi-layer perceptron + """ + data = mx.symbol.Variable('data') + fc1 = mx.symbol.CaffeOp(data_0=data, num_weight=2, name='fc1', prototxt="layer{type:\"InnerProduct\" inner_product_param{num_output: 128} }") + act1 = mx.symbol.CaffeOp(data_0=fc1, prototxt="layer{type:\"TanH\"}") + fc2 = mx.symbol.CaffeOp(data_0=act1, num_weight=2, name='fc2', prototxt="layer{type:\"InnerProduct\" inner_product_param{num_output: 64} }") + act2 = mx.symbol.CaffeOp(data_0=fc2, prototxt="layer{type:\"TanH\"}") + fc3 = mx.symbol.CaffeOp(data_0=act2, num_weight=2, name='fc3', prototxt="layer{type:\"InnerProduct\" inner_product_param{num_output: 10}}") + if use_caffe_loss: + label = mx.symbol.Variable('softmax_label') + mlp = mx.symbol.CaffeLoss(data=fc3, label=label, grad_scale=1, name='softmax', prototxt="layer{type:\"SoftmaxWithLoss\"}") + else: + mlp = mx.symbol.SoftmaxOutput(data=fc3, name='softmax') + return mlp + +def get_lenet(): + """ + LeCun, Yann, Leon Bottou, Yoshua Bengio, and Patrick + Haffner. "Gradient-based learning applied to document recognition." 
+ Proceedings of the IEEE (1998) + """ + data = mx.symbol.Variable('data') + + # first conv + conv1 = mx.symbol.CaffeOp(data_0=data, num_weight=2, prototxt="layer{type:\"Convolution\" convolution_param { num_output: 20 kernel_size: 5 stride: 1} }") + act1 = mx.symbol.CaffeOp(data_0=conv1, prototxt="layer{type:\"TanH\"}") + pool1 = mx.symbol.CaffeOp(data_0=act1, prototxt="layer{type:\"Pooling\" pooling_param { pool: MAX kernel_size: 2 stride: 2}}") + + # second conv + conv2 = mx.symbol.CaffeOp(data_0=pool1, num_weight=2, prototxt="layer{type:\"Convolution\" convolution_param { num_output: 50 kernel_size: 5 stride: 1} }") + act2 = mx.symbol.CaffeOp(data_0=conv2, prototxt="layer{type:\"TanH\"}") + pool2 = mx.symbol.CaffeOp(data_0=act2, prototxt="layer{type:\"Pooling\" pooling_param { pool: MAX kernel_size: 2 stride: 2}}") + + fc1 = mx.symbol.CaffeOp(data_0=pool2, num_weight=2, prototxt="layer{type:\"InnerProduct\" inner_product_param{num_output: 500} }") + act3 = mx.symbol.CaffeOp(data_0=fc1, prototxt="layer{type:\"TanH\"}") + + # second fullc + fc2 = mx.symbol.CaffeOp(data_0=act3, num_weight=2, prototxt="layer{type:\"InnerProduct\"inner_product_param{num_output: 10} }") + if use_caffe_loss: + label = mx.symbol.Variable('softmax_label') + lenet = mx.symbol.CaffeLoss(data=fc2, label=label, grad_scale=1, name='softmax', prototxt="layer{type:\"SoftmaxWithLoss\"}") + else: + lenet = mx.symbol.SoftmaxOutput(data=fc2, name='softmax') + return lenet + +def get_network_from_json_file(file_name): + network = mx.sym.load(file_name) + return network + +def parse_args(): + parser = argparse.ArgumentParser(description='train an image classifier on mnist') + parser.add_argument('--network', type=str, default='lenet', + help='the cnn to use (mlp | lenet | ') + parser.add_argument('--caffe-loss', type=int, default=0, + help='Use CaffeLoss symbol') + parser.add_argument('--caffe-data', type=bool, default=False, + help='Use Caffe input-data layer (True | False)') + 
parser.add_argument('--data-dir', type=str, default='mnist/', + help='the input data directory') + parser.add_argument('--gpus', type=str, + help='the gpus will be used, e.g "0,1,2,3"') + parser.add_argument('--num-examples', type=int, default=60000, + help='the number of training examples') + parser.add_argument('--batch-size', type=int, default=128, + help='the batch size') + parser.add_argument('--lr', type=float, default=.1, + help='the initial learning rate') + parser.add_argument('--model-prefix', type=str, + help='the prefix of the model to load/save') + parser.add_argument('--save-model-prefix', type=str, + help='the prefix of the model to save') + parser.add_argument('--num-epochs', type=int, default=10, + help='the number of training epochs') + parser.add_argument('--load-epoch', type=int, + help="load the model on an epoch using the model-prefix") + parser.add_argument('--kv-store', type=str, default='local', + help='the kvstore type') + parser.add_argument('--lr-factor', type=float, default=1, + help='times the lr with a factor for every lr-factor-epoch epoch') + parser.add_argument('--lr-factor-epoch', type=float, default=1, + help='the number of epoch to factor the lr, could be .5') + return parser.parse_args() + + +if __name__ == '__main__': + args = parse_args() + use_caffe_loss = args.caffe_loss + use_caffe_data = args.caffe_data + + data_shape = () + if args.network == 'mlp': + data_shape = (784, ) + net = get_mlp() + elif args.network == 'lenet': + if not use_caffe_data: + data_shape = (1, 28, 28) + net = get_lenet() + else: + net = get_network_from_json_file(args.network) + + # train + if use_caffe_loss: + train_model.fit(args, net, get_iterator(data_shape, use_caffe_data), mx.metric.Caffe()) + else: + train_model.fit(args, net, get_iterator(data_shape, use_caffe_data)) diff --git a/example/caffe/data.py b/example/caffe/data.py new file mode 100644 index 000000000000..0ed6ed9d0d79 --- /dev/null +++ b/example/caffe/data.py @@ -0,0 +1,92 @@ 
+import sys +import os +# code to automatically download dataset +curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) +sys.path.append(os.path.join(curr_path, "../../tests/python/common")) +import get_data +import mxnet as mx + +def get_iterator(data_shape, use_caffe_data): + def get_iterator_impl_mnist(args, kv): + """return train and val iterators for mnist""" + # download data + get_data.GetMNIST_ubyte() + flat = False if len(data_shape) != 1 else True + + train = mx.io.MNISTIter( + image = "data/train-images-idx3-ubyte", + label = "data/train-labels-idx1-ubyte", + input_shape = data_shape, + batch_size = args.batch_size, + shuffle = True, + flat = flat, + num_parts = kv.num_workers, + part_index = kv.rank) + + val = mx.io.MNISTIter( + image = "data/t10k-images-idx3-ubyte", + label = "data/t10k-labels-idx1-ubyte", + input_shape = data_shape, + batch_size = args.batch_size, + flat = flat, + num_parts = kv.num_workers, + part_index = kv.rank) + + return (train, val) + + def get_iterator_impl_caffe(args, kv): + flat = False if len(data_shape) != 1 else True + train = mx.io.CaffeDataIter( + prototxt = + 'layer { \ + name: "mnist" \ + type: "Data" \ + top: "data" \ + top: "label" \ + include { \ + phase: TRAIN \ + } \ + transform_param { \ + scale: 0.00390625 \ + } \ + data_param { \ + source: "caffe/examples/mnist/mnist_train_lmdb" \ + batch_size: 64 \ + backend: LMDB \ + } \ + }', + flat = flat, + num_examples = 60000 + # float32 is the default, so left out here in order to illustrate + ) + + val = mx.io.CaffeDataIter( + prototxt = + 'layer { \ + name: "mnist" \ + type: "Data" \ + top: "data" \ + top: "label" \ + include { \ + phase: TEST \ + } \ + transform_param { \ + scale: 0.00390625 \ + } \ + data_param { \ + source: "caffe/examples/mnist/mnist_test_lmdb" \ + batch_size: 100 \ + backend: LMDB \ + } \ + }', + flat = flat, + num_examples = 10000, + dtype = "float32" # float32 is the default + ) + + return train, val + + if use_caffe_data: 
+ return get_iterator_impl_caffe + else: + return get_iterator_impl_mnist diff --git a/example/caffe/train_model.py b/example/caffe/train_model.py new file mode 100644 index 000000000000..0fb295a2b916 --- /dev/null +++ b/example/caffe/train_model.py @@ -0,0 +1,100 @@ +import mxnet as mx +import logging +import os + +def fit(args, network, data_loader, eval_metrics=None, batch_end_callback=None): + # kvstore + kv = mx.kvstore.create(args.kv_store) + + # logging + head = '%(asctime)-15s Node[' + str(kv.rank) + '] %(message)s' + if 'log_file' in args and args.log_file is not None: + log_file = args.log_file + log_dir = args.log_dir + log_file_full_name = os.path.join(log_dir, log_file) + if not os.path.exists(log_dir): + os.mkdir(log_dir) + logger = logging.getLogger() + handler = logging.FileHandler(log_file_full_name) + formatter = logging.Formatter(head) + handler.setFormatter(formatter) + logger.addHandler(handler) + logger.setLevel(logging.DEBUG) + logger.info('start with arguments %s', args) + else: + logging.basicConfig(level=logging.DEBUG, format=head) + logging.info('start with arguments %s', args) + + # load model + model_prefix = args.model_prefix + if model_prefix is not None: + model_prefix += "-%d" % (kv.rank) + model_args = {} + if args.load_epoch is not None: + assert model_prefix is not None + tmp = mx.model.FeedForward.load(model_prefix, args.load_epoch) + model_args = {'arg_params' : tmp.arg_params, + 'aux_params' : tmp.aux_params, + 'begin_epoch' : args.load_epoch} + # save model + save_model_prefix = args.save_model_prefix + if save_model_prefix is None: + save_model_prefix = model_prefix + checkpoint = None if save_model_prefix is None else mx.callback.do_checkpoint(save_model_prefix) + + # data + (train, val) = data_loader(args, kv) + + # train + devs = mx.cpu() if args.gpus is None else [ + mx.gpu(int(i)) for i in args.gpus.split(',')] + + epoch_size = args.num_examples / args.batch_size + + if args.kv_store == 'dist_sync': + epoch_size /= 
kv.num_workers + model_args['epoch_size'] = epoch_size + + if 'lr_factor' in args and args.lr_factor < 1: + model_args['lr_scheduler'] = mx.lr_scheduler.FactorScheduler( + step = max(int(epoch_size * args.lr_factor_epoch), 1), + factor = args.lr_factor) + + if 'clip_gradient' in args and args.clip_gradient is not None: + model_args['clip_gradient'] = args.clip_gradient + + # disable kvstore for single device + if 'local' in kv.type and ( + args.gpus is None or len(args.gpus.split(',')) is 1): + kv = None + + model = mx.model.FeedForward( + ctx = devs, + symbol = network, + num_epoch = args.num_epochs, + learning_rate = args.lr, + momentum = 0.9, + wd = 0.00001, + initializer = mx.init.Xavier(factor_type="in", magnitude=2.34), + **model_args) + + if eval_metrics == None: + eval_metrics = ['accuracy'] + ## TopKAccuracy only allows top_k > 1 + for top_k in [5, 10, 20]: + eval_metrics.append(mx.metric.create('top_k_accuracy', top_k = top_k)) + + if batch_end_callback is not None: + if not isinstance(batch_end_callback, list): + batch_end_callback = [batch_end_callback] + else: + batch_end_callback = [] + batch_end_callback.append(mx.callback.Speedometer(args.batch_size, 50)) + + model.fit( + X = train, + eval_data = val, + eval_metric = eval_metrics, + kvstore = kv, + batch_end_callback = batch_end_callback, + epoch_end_callback = checkpoint) diff --git a/example/fcn-xs/README.md b/example/fcn-xs/README.md index a902fcdee7ac..4ee0e9238f78 100644 --- a/example/fcn-xs/README.md +++ b/example/fcn-xs/README.md @@ -12,15 +12,16 @@ we have trained a simple fcn-xs model, the parameter is below: | fcn-32s | 1e-10 | 31 | | fcn-16s | 1e-12 | 27 | | fcn-8s | 1e-14 | 19 | +(```when using the newest mxnet, you'd better using larger learning rate, such as 1e-4, 1e-5, 1e-6 instead, because the newest mxnet will do gradient normalization in SoftmaxOutput```) the training image number is only : 2027, and the Validation image number is: 462 ## How to train fcn-xs in mxnet #### step1: 
download the vgg16fc model and experiment data -* vgg16fc model : you can download the ```VGG_FC_ILSVRC_16_layers-symbol.json``` and ```VGG_FC_ILSVRC_16_layers-0074.params``` from [yun.baidu](http://pan.baidu.com/s/1bgz4PC). +* vgg16fc model : you can download the ```VGG_FC_ILSVRC_16_layers-symbol.json``` and ```VGG_FC_ILSVRC_16_layers-0074.params``` [baidu yun](http://pan.baidu.com/s/1bgz4PC), [dropbox](https://www.dropbox.com/sh/578n5cxej7ofd6m/AACuSeSYGcKQDi1GoB72R5lya?dl=0). this is the fully convolution style of the origin [VGG_ILSVRC_16_layers.caffemodel](http://www.robots.ox.ac.uk/~vgg/software/very_deep/caffe/VGG_ILSVRC_16_layers.caffemodel), and the corresponding [VGG_ILSVRC_16_layers_deploy.prototxt](https://gist.github.com/ksimonyan/211839e770f7b538e2d8#file-vgg_ilsvrc_16_layers_deploy-prototxt), the vgg16 model has [license](http://creativecommons.org/licenses/by-nc/4.0/) for non-commercial use only. -* experiment data : you can download the ```VOC2012.rar``` from [yun.baidu](http://pan.baidu.com/s/1bgz4PC), and Extract it. the file/folder will be like: +* experiment data : you can download the ```VOC2012.rar``` [robots.ox.ac.uk](http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar), and Extract it. 
the file/folder will be like: ```JPEGImages folder```, ```SegmentationClass folder```, ```train.lst```, ```val.lst```, ```test.lst``` #### step2: train fcn-xs model diff --git a/example/fcn-xs/image_segmentaion.py b/example/fcn-xs/image_segmentaion.py index 56c7482fcb81..a09744773596 100644 --- a/example/fcn-xs/image_segmentaion.py +++ b/example/fcn-xs/image_segmentaion.py @@ -3,27 +3,25 @@ import mxnet as mx from PIL import Image -pallete = [ 0,0,0, - 128,0,0, - 0,128,0, - 128,128,0, - 0,0,128, - 128,0,128, - 0,128,128, - 128,128,128, - 64,0,0, - 192,0,0, - 64,128,0, - 192,128,0, - 64,0,128, - 192,0,128, - 64,128,128, - 192,128,128, - 0,64,0, - 128,64,0, - 0,192,0, - 128,192,0, - 0,64,128 ] +def getpallete(num_cls): + # this function is to get the colormap for visualizing the segmentation mask + n = num_cls + pallete = [0]*(n*3) + for j in xrange(0,n): + lab = j + pallete[j*3+0] = 0 + pallete[j*3+1] = 0 + pallete[j*3+2] = 0 + i = 0 + while (lab > 0): + pallete[j*3+0] |= (((lab >> 0) & 1) << (7-i)) + pallete[j*3+1] |= (((lab >> 1) & 1) << (7-i)) + pallete[j*3+2] |= (((lab >> 2) & 1) << (7-i)) + i = i + 1 + lab >>= 3 + return pallete + +pallete = getpallete(256) img = "./person_bicycle.jpg" seg = img.replace("jpg", "png") model_previx = "FCN8s_VGG16" diff --git a/example/fcn-xs/init_fcnxs.py b/example/fcn-xs/init_fcnxs.py index 69295ce6be68..c90a45bb4358 100644 --- a/example/fcn-xs/init_fcnxs.py +++ b/example/fcn-xs/init_fcnxs.py @@ -11,7 +11,7 @@ def upsample_filt(size): factor = (size + 1) // 2 if size % 2 == 1: - center = factor - 1 + center = factor - 1.0 else: center = factor - 0.5 og = np.ogrid[:size, :size] diff --git a/example/fcn-xs/symbol_fcnxs.py b/example/fcn-xs/symbol_fcnxs.py index ab283fa13f50..a24a81eb6818 100644 --- a/example/fcn-xs/symbol_fcnxs.py +++ b/example/fcn-xs/symbol_fcnxs.py @@ -2,8 +2,7 @@ import mxnet as mx def filter_map(kernel=1, stride=1, pad=0): - # why not return (stride, (kernel-stride)/2-pad)?? 
- return (stride, (kernel-1)/2-pad) + return (stride, (kernel-stride)/2-pad) def compose_fp(fp_first, fp_second): return (fp_first[0]*fp_second[0], fp_first[0]*fp_second[1]+fp_first[1]) diff --git a/example/image-classification/README.md b/example/image-classification/README.md index 63888093a02a..4e06fd14abde 100644 --- a/example/image-classification/README.md +++ b/example/image-classification/README.md @@ -93,7 +93,7 @@ We can train a model using multiple machines. ``` See more launch options, e.g. by `Yarn`, and how to write a distributed training -program on this [tutorial](http://mxnet.readthedocs.org/en/latest/distributed_training.html) +program on this [tutorial](http://mxnet.readthedocs.io/en/latest/how_to/multi_devices.html) ### Predict diff --git a/example/image-classification/symbol_alexnet.R b/example/image-classification/symbol_alexnet.R new file mode 100644 index 000000000000..ec768c9adb14 --- /dev/null +++ b/example/image-classification/symbol_alexnet.R @@ -0,0 +1,36 @@ +library(mxnet) + +get_symbol <- function(num_classes = 1000) { + input_data <- mx.symbol.Variable(name = "data") + # stage 1 + conv1 <- mx.symbol.Convolution(data = input_data, kernel = c(11, 11), stride = c(4, 4), num_filter = 96) + relu1 <- mx.symbol.Activation(data = conv1, act_type = "relu") + pool1 <- mx.symbol.Pooling(data = relu1, pool_type = "max", kernel = c(3, 3), stride = c(2, 2)) + lrn1 <- mx.symbol.LRN(data = pool1, alpha = 0.0001, beta = 0.75, knorm = 1, nsize = 5) + # stage 2 + conv2 <- mx.symbol.Convolution(data = lrn1, kernel = c(5, 5), pad = c(2, 2), num_filter = 256) + relu2 <- mx.symbol.Activation(data = conv2, act_type = "relu") + pool2 <- mx.symbol.Pooling(data = relu2, kernel = c(3, 3), stride = c(2, 2), pool_type = "max") + lrn2 <- mx.symbol.LRN(data = pool2, alpha = 0.0001, beta = 0.75, knorm = 1, nsize = 5) + # stage 3 + conv3 <- mx.symbol.Convolution(data = lrn2, kernel = c(3, 3), pad = c(1, 1), num_filter = 384) + relu3 <- mx.symbol.Activation(data = 
conv3, act_type = "relu") + conv4 <- mx.symbol.Convolution(data = relu3, kernel = c(3, 3), pad = c(1, 1), num_filter = 384) + relu4 <- mx.symbol.Activation(data = conv4, act_type = "relu") + conv5 <- mx.symbol.Convolution(data = relu4, kernel = c(3, 3), pad = c(1, 1), num_filter = 256) + relu5 <- mx.symbol.Activation(data = conv5, act_type = "relu") + pool3 <- mx.symbol.Pooling(data = relu5, kernel = c(3, 3), stride = c(2, 2), pool_type = "max") + # stage 4 + flatten <- mx.symbol.Flatten(data = pool3) + fc1 <- mx.symbol.FullyConnected(data = flatten, num_hidden = 4096) + relu6 <- mx.symbol.Activation(data = fc1, act_type = "relu") + dropout1 <- mx.symbol.Dropout(data = relu6, p = 0.5) + # stage 5 + fc2 <- mx.symbol.FullyConnected(data = dropout1, num_hidden = 4096) + relu7 <- mx.symbol.Activation(data = fc2, act_type = "relu") + dropout2 <- mx.symbol.Dropout(data = relu7, p = 0.5) + # stage 6 + fc3 <- mx.symbol.FullyConnected(data = dropout2, num_hidden = num_classes) + softmax <- mx.symbol.SoftmaxOutput(data = fc3, name = 'softmax') + return(softmax) +} diff --git a/example/image-classification/symbol_googlenet.R b/example/image-classification/symbol_googlenet.R new file mode 100644 index 000000000000..8e7ac7224d55 --- /dev/null +++ b/example/image-classification/symbol_googlenet.R @@ -0,0 +1,67 @@ +library(mxnet) + +ConvFactory <- function(data, num_filter, kernel, stride = c(1, 1), pad = c(0, 0), + name = '', suffix = '') { + conv <- mx.symbol.Convolution(data = data, num_filter = num_filter, kernel = kernel, stride = stride, + pad = pad, name = paste('conv_', name, suffix, sep = "")) + act <- mx.symbol.Activation(data = conv, act_type = 'relu', name = paste('relu_', name, suffix, sep = '')) + return(act) +} + +InceptionFactory <- function(data, num_1x1, num_3x3red, num_3x3, + num_d5x5red, num_d5x5, pool, proj, name) { + # 1x1 + c1x1 <- ConvFactory(data = data, num_filter = num_1x1, kernel = c(1, 1), + name = paste(name, '_1x1', sep = '')) + # 3x3 reduce + 3x3 + 
c3x3r = ConvFactory(data = data, num_filter = num_3x3red, kernel = c(1, 1), + name = paste(name, '_3x3', sep = ''), suffix = '_reduce') + c3x3 = ConvFactory(data = c3x3r, num_filter = num_3x3, kernel = c(3, 3), + pad = c(1, 1), name = paste(name, '_3x3', sep = '')) + # double 3x3 reduce + double 3x3 + cd5x5r = ConvFactory(data = data, num_filter = num_d5x5red, kernel = c(1, 1), + name = paste(name, '_5x5', sep = ''), suffix = '_reduce') + cd5x5 = ConvFactory(data = cd5x5r, num_filter = num_d5x5, kernel = c(5, 5), pad = c(2, 2), + name = paste(name, '_5x5', sep = '')) + # pool + proj + pooling = mx.symbol.Pooling(data = data, kernel = c(3, 3), stride = c(1, 1), + pad = c(1, 1), pool_type = pool, + name = paste(pool, '_pool_', name, '_pool', sep = '')) + + cproj = ConvFactory(data = pooling, num_filter = proj, kernel = c(1, 1), + name = paste(name, '_proj', sep = '')) + # concat + concat_lst <- list() + concat_lst <- c(c1x1, c3x3, cd5x5, cproj) + concat_lst$num.args = 4 + concat_lst$name = paste('ch_concat_', name, '_chconcat', sep = '') + concat = mxnet:::mx.varg.symbol.Concat(concat_lst) + return(concat) +} + + +get_symbol <- function(num_classes = 1000) { + data <- mx.symbol.Variable("data") + conv1 <- ConvFactory(data, 64, kernel = c(7, 7), stride = c(2, 2), pad = c(3, 3), name = "conv1") + pool1 <- mx.symbol.Pooling(conv1, kernel = c(3, 3), stride = c(2, 2), pool_type = "max") + conv2 <- ConvFactory(pool1, 64, kernel = c(1, 1), stride = c(1, 1), name = "conv2") + conv3 <- ConvFactory(conv2, 192, kernel = c(3, 3), stride = c(1, 1), pad = c(1, 1), name = "conv3") + pool3 <- mx.symbol.Pooling(conv3, kernel = c(3, 3), stride = c(2, 2), pool_type = "max") + + in3a <- InceptionFactory(pool3, 64, 96, 128, 16, 32, "max", 32, name = "in3a") + in3b <- InceptionFactory(in3a, 128, 128, 192, 32, 96, "max", 64, name = "in3b") + pool4 <- mx.symbol.Pooling(in3b, kernel = c(3, 3), stride = c(2, 2), pool_type = "max") + in4a <- InceptionFactory(pool4, 192, 96, 208, 16, 48, "max", 
64, name = "in4a") + in4b <- InceptionFactory(in4a, 160, 112, 224, 24, 64, "max", 64, name = "in4b") + in4c <- InceptionFactory(in4b, 128, 128, 256, 24, 64, "max", 64, name = "in4c") + in4d <- InceptionFactory(in4c, 112, 144, 288, 32, 64, "max", 64, name = "in4d") + in4e <- InceptionFactory(in4d, 256, 160, 320, 32, 128, "max", 128, name = "in4e") + pool5 <- mx.symbol.Pooling(in4e, kernel = c(3, 3), stride = c(2, 2), pool_type = "max") + in5a <- InceptionFactory(pool5, 256, 160, 320, 32, 128, "max", 128, name = "in5a") + in5b <- InceptionFactory(in5a, 384, 192, 384, 48, 128, "max", 128, name = "in5b") + pool6 <- mx.symbol.Pooling(in5b, kernel = c(7, 7), stride = c(1, 1), pool_type = "avg" ) + flatten <- mx.symbol.Flatten(data = pool6, name = 'flatten0') + fc1 <- mx.symbol.FullyConnected(data = flatten, num_hidden = num_classes) + softmax <- mx.symbol.SoftmaxOutput(data = fc1, name = 'softmax') + return(softmax) +} diff --git a/example/image-classification/symbol_inception-bn.R b/example/image-classification/symbol_inception-bn.R new file mode 100644 index 000000000000..59d871e08f4a --- /dev/null +++ b/example/image-classification/symbol_inception-bn.R @@ -0,0 +1,113 @@ +library(mxnet) + +ConvFactory <- function(data, num_filter, kernel, stride = c(1, 1), + pad = c(0, 0), name = '', suffix = '') { + conv <- mx.symbol.Convolution(data = data, num_filter = num_filter, + kernel = kernel, stride = stride, pad = pad, + name = paste('conv_', name, suffix, sep = '')) + + bn <- mx.symbol.BatchNorm(data = conv, name = paste('bn_', name, suffix, sep = '')) + act <- mx.symbol.Activation(data = bn, act_type = 'relu', name = paste('relu_', name, suffix, sep = '')) + return(act) +} + +InceptionFactoryA <- function(data, num_1x1, num_3x3red, num_3x3, num_d3x3red, + num_d3x3, pool, proj, name) { + # 1x1 + c1x1 <- ConvFactory(data = data, num_filter = num_1x1, kernel = c(1, 1), name = paste(name, '_1x1', sep = '') + ) + # 3x3 reduce + 3x3 + c3x3r <- ConvFactory(data = data, 
num_filter = num_3x3red, kernel = c(1, 1), + name = paste(name, '_3x3', sep = ''), suffix = '_reduce') + + c3x3 <- ConvFactory(data = c3x3r, num_filter = num_3x3, kernel = c(3, 3), + pad = c(1, 1), name = paste(name, '_3x3', sep = '')) + # double 3x3 reduce + double 3x3 + cd3x3r <- ConvFactory(data = data, num_filter = num_d3x3red, kernel = c(1, 1), + name = paste(name, '_double_3x3', sep = ''), suffix = '_reduce') + + cd3x3 <- ConvFactory(data = cd3x3r, num_filter = num_d3x3, kernel = c(3, 3), + pad = c(1, 1), name = paste(name, '_double_3x3_0', sep = '')) + + cd3x3 <- ConvFactory(data = cd3x3, num_filter = num_d3x3, kernel = c(3, 3), + pad = c(1, 1), name = paste(name, '_double_3x3_1', sep = '')) + # pool + proj + pooling <- mx.symbol.Pooling(data = data, kernel = c(3, 3), stride = c(1, 1), + pad = c(1, 1), pool_type = pool, + name = paste(pool, '_pool_', name, '_pool', sep = '')) + cproj <- ConvFactory(data = pooling, num_filter = proj, kernel = c(1, 1), + name = paste(name, '_proj', sep = '')) + # concat + concat_lst <- list() + concat_lst <- c(c1x1, c3x3, cd3x3, cproj) + concat_lst$num.args = 4 + concat_lst$name = paste('ch_concat_', name, '_chconcat', sep = '') + concat = mxnet:::mx.varg.symbol.Concat(concat_lst) + return(concat) +} + +InceptionFactoryB <- function(data, num_3x3red, num_3x3, num_d3x3red, num_d3x3, name) { + # 3x3 reduce + 3x3 + c3x3r <- ConvFactory(data = data, num_filter = num_3x3red, kernel = c(1, 1), + name = paste(name, '_3x3', sep = ''), suffix = '_reduce') + c3x3 <- ConvFactory(data = c3x3r, num_filter = num_3x3, kernel = c(3, 3), + pad = c(1, 1), stride = c(2, 2), name = paste(name, '_3x3', sep = '')) + # double 3x3 reduce + double 3x3 + cd3x3r <- ConvFactory(data = data, num_filter = num_d3x3red, kernel = c(1, 1), + name = paste(name, '_double_3x3', sep = ''), suffix = '_reduce') + cd3x3 <- ConvFactory(data = cd3x3r, num_filter = num_d3x3, kernel = c(3, 3), + pad = c(1, 1), stride = c(1, 1), name = paste(name, '_double_3x3_0', sep = 
'')) + cd3x3 = ConvFactory(data = cd3x3, num_filter = num_d3x3, kernel = c(3, 3), + pad = c(1, 1), stride = c(2, 2), name = paste(name, '_double_3x3_1', sep = '')) + # pool + proj + pooling = mx.symbol.Pooling(data = data, kernel = c(3, 3), stride = c(2, 2), + pad = c(1, 1), pool_type = "max", + name = paste('max_pool_', name, '_pool', sep = '')) + # concat + concat_lst <- list() + concat_lst <- c(c3x3, cd3x3, pooling) + concat_lst$num.args = 3 + concat_lst$name = paste('ch_concat_', name, '_chconcat', sep = '') + concat = mxnet:::mx.varg.symbol.Concat(concat_lst) + return(concat) +} + +get_symbol <- function(num_classes = 1000) { + # data + data = mx.symbol.Variable(name = "data") + # stage 1 + conv1 = ConvFactory(data = data, num_filter = 64, kernel = c(7, 7), + stride = c(2, 2), pad = c(3, 3), name = 'conv1') + pool1 = mx.symbol.Pooling(data = conv1, kernel = c(3, 3), stride = c(2, 2), + name = 'pool1', pool_type = 'max') + # stage 2 + conv2red = ConvFactory(data = pool1, num_filter = 64, kernel = c(1, 1), + stride = c(1, 1), name = 'conv2red') + conv2 = ConvFactory(data = conv2red, num_filter = 192, kernel = c(3, 3), + stride = c(1, 1), pad = c(1, 1), name = 'conv2') + pool2 = mx.symbol.Pooling(data = conv2, kernel = c(3, 3), stride = c(2, 2), + name = 'pool2', pool_type = 'max') + # stage 2 + in3a = InceptionFactoryA(pool2, 64, 64, 64, 64, 96, "avg", 32, '3a') + in3b = InceptionFactoryA(in3a, 64, 64, 96, 64, 96, "avg", 64, '3b') + in3c = InceptionFactoryB(in3b, 128, 160, 64, 96, '3c') + # stage 3 + in4a = InceptionFactoryA(in3c, 224, 64, 96, 96, 128, "avg", 128, '4a') + in4b = InceptionFactoryA(in4a, 192, 96, 128, 96, 128, "avg", 128, '4b') + in4c = InceptionFactoryA(in4b, 160, 128, 160, 128, 160, "avg", 128, '4c') + in4d = InceptionFactoryA(in4c, 96, 128, 192, 160, 192, "avg", 128, '4d') + in4e = InceptionFactoryB(in4d, 128, 192, 192, 256, '4e') + # stage 4 + in5a = InceptionFactoryA(in4e, 352, 192, 320, 160, 224, "avg", 128, '5a') + in5b = 
InceptionFactoryA(in5a, 352, 192, 320, 192, 224, "max", 128, '5b') + # global avg pooling + avg = mx.symbol.Pooling(data = in5b, kernel = c(7, 7), stride = c(1, 1), + name = "global_pool", pool_type = 'avg') + # linear classifier + flatten = mx.symbol.Flatten(data = avg, name = 'flatten') + fc1 = mx.symbol.FullyConnected(data = flatten, + num_hidden = num_classes, + name = 'fc1') + softmax = mx.symbol.SoftmaxOutput(data = fc1, name = 'softmax') + return(softmax) +} diff --git a/example/image-classification/symbol_lenet.R b/example/image-classification/symbol_lenet.R new file mode 100644 index 000000000000..b9991912bf52 --- /dev/null +++ b/example/image-classification/symbol_lenet.R @@ -0,0 +1,24 @@ +library(mxnet) + +get_symbol <- function(num_classes = 1000) { + data <- mx.symbol.Variable('data') + # first conv + conv1 <- mx.symbol.Convolution(data = data, kernel = c(5, 5), num_filter = 20) + + tanh1 <- mx.symbol.Activation(data = conv1, act_type = "tanh") + pool1 <- mx.symbol.Pooling(data = tanh1, pool_type = "max", kernel = c(2, 2), stride = c(2, 2)) + + # second conv + conv2 <- mx.symbol.Convolution(data = pool1, kernel = c(5, 5), num_filter = 50) + tanh2 <- mx.symbol.Activation(data = conv2, act_type = "tanh") + pool2 <- mx.symbol.Pooling(data = tanh2, pool_type = "max", kernel = c(2, 2), stride = c(2, 2)) + # first fullc + flatten <- mx.symbol.Flatten(data = pool2) + fc1 <- mx.symbol.FullyConnected(data = flatten, num_hidden = 500) + tanh3 <- mx.symbol.Activation(data = fc1, act_type = "tanh") + # second fullc + fc2 <- mx.symbol.FullyConnected(data = tanh3, num_hidden = num_classes) + # loss + lenet <- mx.symbol.SoftmaxOutput(data = fc2, name = 'softmax') + return(lenet) +} diff --git a/example/image-classification/symbol_mlp.R b/example/image-classification/symbol_mlp.R new file mode 100644 index 000000000000..55aaf1f776ff --- /dev/null +++ b/example/image-classification/symbol_mlp.R @@ -0,0 +1,12 @@ +library(mxnet) + +get_symbol <- function(num_classes = 
1000) { + data <- mx.symbol.Variable('data') + fc1 <- mx.symbol.FullyConnected(data = data, name = 'fc1', num_hidden = 128) + act1 <- mx.symbol.Activation(data = fc1, name = 'relu1', act_type = "relu") + fc2 <- mx.symbol.FullyConnected(data = act1, name = 'fc2', num_hidden = 64) + act2 <- mx.symbol.Activation(data = fc2, name = 'relu2', act_type = "relu") + fc3 <- mx.symbol.FullyConnected(data = act2, name = 'fc3', num_hidden = num_classes) + mlp <- mx.symbol.SoftmaxOutput(data = fc3, name = 'softmax') + return(mlp) +} diff --git a/example/image-classification/symbol_resnet-28-small.R b/example/image-classification/symbol_resnet-28-small.R new file mode 100644 index 000000000000..4ef9e950059d --- /dev/null +++ b/example/image-classification/symbol_resnet-28-small.R @@ -0,0 +1,82 @@ +library(mxnet) + +conv_factory <- function(data, num_filter, kernel, stride, + pad, act_type = 'relu', conv_type = 0) { + if (conv_type == 0) { + conv = mx.symbol.Convolution(data = data, num_filter = num_filter, + kernel = kernel, stride = stride, pad = pad) + bn = mx.symbol.BatchNorm(data = conv) + act = mx.symbol.Activation(data = bn, act_type = act_type) + return(act) + } else if (conv_type == 1) { + conv = mx.symbol.Convolution(data = data, num_filter = num_filter, + kernel = kernel, stride = stride, pad = pad) + bn = mx.symbol.BatchNorm(data = conv) + return(bn) + } +} + +residual_factory <- function(data, num_filter, dim_match) { + if (dim_match) { + identity_data = data + conv1 = conv_factory(data = data, num_filter = num_filter, kernel = c(3, 3), + stride = c(1, 1), pad = c(1, 1), act_type = 'relu', conv_type = 0) + + conv2 = conv_factory(data = conv1, num_filter = num_filter, kernel = c(3, 3), + stride = c(1, 1), pad = c(1, 1), conv_type = 1) + new_data = identity_data + conv2 + act = mx.symbol.Activation(data = new_data, act_type = 'relu') + return(act) + } else { + conv1 = conv_factory(data = data, num_filter = num_filter, kernel = c(3, 3), + stride = c(2, 2), pad = c(1, 1), 
act_type = 'relu', conv_type = 0) + conv2 = conv_factory(data = conv1, num_filter = num_filter, kernel = c(3, 3), + stride = c(1, 1), pad = c(1, 1), conv_type = 1) + + # adopt project method in the paper when dimension increased + project_data = conv_factory(data = data, num_filter = num_filter, kernel = c(1, 1), + stride = c(2, 2), pad = c(0, 0), conv_type = 1) + new_data = project_data + conv2 + act = mx.symbol.Activation(data = new_data, act_type = 'relu') + return(act) + } +} + +residual_net <- function(data, n) { + #fisrt 2n layers + for (i in 1:n) { + data = residual_factory(data = data, num_filter = 16, dim_match = TRUE) + } + + + #second 2n layers + for (i in 1:n) { + if (i == 1) { + data = residual_factory(data = data, num_filter = 32, dim_match = FALSE) + } else { + data = residual_factory(data = data, num_filter = 32, dim_match = TRUE) + } + } + #third 2n layers + for (i in 1:n) { + if (i == 1) { + data = residual_factory(data = data, num_filter = 64, dim_match = FALSE) + } else { + data = residual_factory(data = data, num_filter = 64, dim_match = TRUE) + } + } + return(data) +} + +get_symbol <- function(num_classes = 10) { + conv <- conv_factory(data = mx.symbol.Variable(name = 'data'), num_filter = 16, + kernel = c(3, 3), stride = c(1, 1), pad = c(1, 1), + act_type = 'relu', conv_type = 0) + n <- 3 # set n = 3 means get a model with 3*6+2=20 layers, set n = 9 means 9*6+2=56 layers + resnet <- residual_net(conv, n) # + pool <- mx.symbol.Pooling(data = resnet, kernel = c(7, 7), pool_type = 'avg') + flatten <- mx.symbol.Flatten(data = pool, name = 'flatten') + fc <- mx.symbol.FullyConnected(data = flatten, num_hidden = num_classes, name = 'fc1') + softmax <- mx.symbol.SoftmaxOutput(data = fc, name = 'softmax') + return(softmax) +} diff --git a/example/image-classification/symbol_resnet.R b/example/image-classification/symbol_resnet.R new file mode 100644 index 000000000000..224fa91c8b25 --- /dev/null +++ b/example/image-classification/symbol_resnet.R @@ 
-0,0 +1,70 @@ +library(mxnet) + +get_conv <- function(name, data, num_filter, kernel, stride, + pad, with_relu, bn_momentum) { + conv = mx.symbol.Convolution(name = name, data = data, num_filter = num_filter, + kernel = kernel, stride = stride, pad = pad, no_bias = TRUE) + bn = mx.symbol.BatchNorm(name = paste(name, '_bn', sep = ''), data = conv, + fix_gamma = FALSE, momentum = bn_momentum, eps = 2e-5) + if (with_relu) { + return(mx.symbol.Activation(name = paste(name, '_relu', sep = ''), + data = bn, act_type = 'relu')) + } else { + return(bn) + } +} + +make_block <- function(name, data, num_filter, dim_match, bn_momentum) { + if (dim_match) { + conv1 = get_conv(name = paste(name, '_conv1', sep = ''), data = data, + num_filter = num_filter, kernel = c(3, 3), stride = c(1, 1), + pad = c(1, 1), with_relu = TRUE, bn_momentum = bn_momentum) + } else { + conv1 = get_conv(name = paste(name, '_conv1', sep = ''), data = data, + num_filter = num_filter, kernel = c(3, 3), stride = c(2, 2), + pad = c(1, 1), with_relu = TRUE, bn_momentum = bn_momentum) + } + + conv2 = get_conv(name = paste(name, '_conv2', sep = ''), data = conv1, + num_filter = num_filter, kernel = c(3, 3), stride = c(1, 1), + pad = c(1, 1), with_relu = FALSE, bn_momentum = bn_momentum) + if (dim_match) { + shortcut = data + } else { + shortcut = mx.symbol.Convolution(name = paste(name, '_proj', sep = ''), + data = data, num_filter = num_filter, kernel = c(2, 2), + stride = c(2, 2), pad = c(0, 0), no_bias = TRUE) + } + fused = shortcut + conv2 + return(mx.symbol.Activation(name = paste(name, '_relu', sep = ''), data = fused, act_type = 'relu')) +} + +get_body <- function(data, num_level, num_block, num_filter, bn_momentum) { + for (level in 1:num_level) { + for (block in 1:num_block) { + data = make_block( + name = paste('level', level, '_block', block, sep = ''), + data = data, + num_filter = num_filter * 2 ^ (level - 1), + dim_match = (level == 1 || block > 1), + bn_momentum = bn_momentum + ) + } + } + 
return(data) +} + +get_symbol <- function(num_class, num_level = 3, num_block = 9, + num_filter = 16, bn_momentum = 0.9, pool_kernel = c(8, 8)) { + data = mx.symbol.Variable(name = 'data') + zscore = mx.symbol.BatchNorm(name = 'zscore', data = data, + fix_gamma = TRUE, momentum = bn_momentum) + conv = get_conv(name = 'conv0', data = zscore, num_filter = num_filter, + kernel = c(3, 3), stride = c(1, 1), pad = c(1, 1), + with_relu = TRUE, bn_momentum = bn_momentum) + body = get_body(conv, num_level, num_block, num_filter, bn_momentum) + pool = mx.symbol.Pooling(data = body, kernel = pool_kernel, pool_type = 'avg') + flat = mx.symbol.Flatten(data = pool) + fc = mx.symbol.FullyConnected(data = flat, num_hidden = num_class, name = 'fc') + return(mx.symbol.SoftmaxOutput(data = fc, name = 'softmax')) +} diff --git a/example/image-classification/symbol_unet.R b/example/image-classification/symbol_unet.R new file mode 100644 index 000000000000..e15b48a4a005 --- /dev/null +++ b/example/image-classification/symbol_unet.R @@ -0,0 +1,81 @@ +library(mxnet) + +convolution_module <- function(net, kernel_size, pad_size, + filter_count, stride = c(1, 1), work_space = 2048, + batch_norm = TRUE, down_pool = FALSE, up_pool = FALSE, + act_type = "relu", convolution = TRUE) { + if (up_pool) { + net = mx.symbol.Deconvolution(net, kernel = c(2, 2), pad = c(0, 0), + stride = c(2, 2), num_filter = filter_count, workspace = work_space) + net = mx.symbol.BatchNorm(net) + if (act_type != "") { + net = mx.symbol.Activation(net, act_type = act_type) + } + } + if (convolution) { + conv = mx.symbol.Convolution(data = net, kernel = kernel_size, stride = stride, + pad = pad_size, num_filter = filter_count, workspace = work_space) + net = conv + } + + if (batch_norm) { + net = mx.symbol.BatchNorm(net) + } + + if (act_type != "") { + net = mx.symbol.Activation(net, act_type = act_type) + } + + if (down_pool) { + pool = mx.symbol.Pooling(net, pool_type = "max", kernel = c(2, 2), stride = c(2, 2)) + net 
= pool + } + return(net) +} + +get_symbol <- function(num_classes = 10) { + data = mx.symbol.Variable('data') + kernel_size = c(3, 3) + pad_size = c(1, 1) + filter_count = 32 + pool1 = convolution_module(data, kernel_size, pad_size, filter_count = filter_count, down_pool = TRUE) + net = pool1 + pool2 = convolution_module(net, kernel_size, pad_size, filter_count = filter_count * 2, down_pool = TRUE) + net = pool2 + pool3 = convolution_module(net, kernel_size, pad_size, filter_count = filter_count * 4, down_pool = TRUE) + net = pool3 + pool4 = convolution_module(net, + kernel_size, + pad_size, + filter_count = filter_count * 4, + down_pool = TRUE) + net = pool4 + net = mx.symbol.Dropout(net) + pool5 = convolution_module(net, kernel_size, pad_size, filter_count = filter_count * 8, down_pool = TRUE) + net = pool5 + net = convolution_module(net, kernel_size, pad_size, filter_count = filter_count * 4, up_pool = TRUE) + net = convolution_module(net, kernel_size, pad_size, filter_count = filter_count * 4, up_pool = TRUE) + + # dirty "CROP" to wanted size... 
I was on old MxNet branch so used conv instead of crop for cropping + net = convolution_module(net, c(4, 4), c(0, 0), filter_count = filter_count * 4) + + net = mx.symbol.Concat(c(pool3, net), num.args = 2) + net = mx.symbol.Dropout(net) + net = convolution_module(net, kernel_size, pad_size, filter_count = filter_count * 4) + net = convolution_module(net, kernel_size, pad_size, filter_count = filter_count * 4, up_pool = TRUE) + + net = mx.symbol.Concat(c(pool2, net), num.args = 2) + net = mx.symbol.Dropout(net) + net = convolution_module(net, kernel_size, pad_size, filter_count = filter_count * 4) + net = convolution_module(net, kernel_size, pad_size, + filter_count = filter_count * 4, up_pool = TRUE) + convolution_module(net, kernel_size, pad_size, filter_count = filter_count * 4) + net = mx.symbol.Concat(c(pool1, net), num.args = 2) + net = mx.symbol.Dropout(net) + net = convolution_module(net, kernel_size, pad_size, filter_count = filter_count * 2) + net = convolution_module(net, kernel_size, pad_size, filter_count = filter_count * 2, up_pool = TRUE) + net = mx.symbol.Flatten(net) + net = mx.symbol.FullyConnected(data = net, num_hidden = num_classes) + net = mx.symbol.SoftmaxOutput(data = net, name = 'softmax') + return(net) +} \ No newline at end of file diff --git a/example/image-classification/symbol_vgg.R b/example/image-classification/symbol_vgg.R new file mode 100644 index 000000000000..4ebd1017a3f6 --- /dev/null +++ b/example/image-classification/symbol_vgg.R @@ -0,0 +1,58 @@ +library(mxnet) + +get_symbol <- function(num_classes = 1000) { + ## define alexnet + data = mx.symbol.Variable(name = "data") + # group 1 + conv1_1 = mx.symbol.Convolution(data = data, kernel = c(3, 3), pad = c(1, 1), + num_filter = 64, name = "conv1_1") + relu1_1 = mx.symbol.Activation(data = conv1_1, act_type = "relu", name = "relu1_1") + pool1 = mx.symbol.Pooling(data = relu1_1, pool_type = "max", kernel = c(2, 2), + stride = c(2, 2), name = "pool1") + # group 2 + conv2_1 = 
mx.symbol.Convolution(data = pool1, kernel = c(3, 3), pad = c(1, 1), + num_filter = 128, name = "conv2_1") + relu2_1 = mx.symbol.Activation(data = conv2_1, act_type = "relu", name = "relu2_1") + pool2 = mx.symbol.Pooling(data = relu2_1, pool_type = "max", kernel = c(2, 2), + stride = c(2, 2), name = "pool2") + # group 3 + conv3_1 = mx.symbol.Convolution(data = pool2, kernel = c(3, 3), pad = c(1, 1), + num_filter = 256, name = "conv3_1") + relu3_1 = mx.symbol.Activation(data = conv3_1, act_type = "relu", name = "relu3_1") + conv3_2 = mx.symbol.Convolution(data = relu3_1, kernel = c(3, 3), pad = c(1, 1), + num_filter = 256, name = "conv3_2") + relu3_2 = mx.symbol.Activation(data = conv3_2, act_type = "relu", name = "relu3_2") + pool3 = mx.symbol.Pooling(data = relu3_2, pool_type = "max", kernel = c(2, 2), + stride = c(2, 2), name = "pool3") + # group 4 + conv4_1 = mx.symbol.Convolution(data = pool3, kernel = c(3, 3), pad = c(1, 1), + num_filter = 512, name = "conv4_1") + relu4_1 = mx.symbol.Activation(data = conv4_1, act_type = "relu", name = "relu4_1") + conv4_2 = mx.symbol.Convolution(data = relu4_1, kernel = c(3, 3), pad = c(1, 1), + num_filter = 512, name = "conv4_2") + relu4_2 = mx.symbol.Activation(data = conv4_2, act_type = "relu", name = "relu4_2") + pool4 = mx.symbol.Pooling(data = relu4_2, pool_type = "max", + kernel = c(2, 2), stride = c(2, 2), name = "pool4") + # group 5 + conv5_1 = mx.symbol.Convolution(data = pool4, kernel = c(3, 3), + pad = c(1, 1), num_filter = 512, name = "conv5_1") + relu5_1 = mx.symbol.Activation(data = conv5_1, act_type = "relu", name = "relu5_1") + conv5_2 = mx.symbol.Convolution(data = relu5_1, kernel = c(3, 3), + pad = c(1, 1), num_filter = 512, name = "conv5_2") + relu5_2 = mx.symbol.Activation(data = conv5_2, act_type = "relu", name = "relu5_2") + pool5 = mx.symbol.Pooling(data = relu5_2, pool_type = "max", + kernel = c(2, 2), stride = c(2, 2), name = "pool5") + # group 6 + flatten = mx.symbol.Flatten(data = pool5, name = 
"flatten") + fc6 = mx.symbol.FullyConnected(data = flatten, num_hidden = 4096, name = "fc6") + relu6 = mx.symbol.Activation(data = fc6, act_type = "relu", name = "relu6") + drop6 = mx.symbol.Dropout(data = relu6, p = 0.5, name = "drop6") + # group 7 + fc7 = mx.symbol.FullyConnected(data = drop6, num_hidden = 4096, name = "fc7") + relu7 = mx.symbol.Activation(data = fc7, act_type = "relu", name = "relu7") + drop7 = mx.symbol.Dropout(data = relu7, p = 0.5, name = "drop7") + # output + fc8 = mx.symbol.FullyConnected(data = drop7, num_hidden = num_classes, name = "fc8") + softmax = mx.symbol.SoftmaxOutput(data = fc8, name = 'softmax') + return(softmax) +} diff --git a/example/image-classification/train_cifar10.py b/example/image-classification/train_cifar10.py index dc3580cd3181..b0c1c2848969 100644 --- a/example/image-classification/train_cifar10.py +++ b/example/image-classification/train_cifar10.py @@ -50,8 +50,7 @@ def _download(data_dir): net = importlib.import_module('symbol_' + args.network).get_symbol(10) # data -def get_iterator(args, kv): - data_shape = (3, 28, 28) +def get_iterator(args, kv, data_shape=(3, 28, 28)): if '://' not in args.data_dir: _download(args.data_dir) @@ -77,5 +76,6 @@ def get_iterator(args, kv): return (train, val) -# train -train_model.fit(args, net, get_iterator) +if __name__ == '__main__': + # train + train_model.fit(args, net, get_iterator) diff --git a/example/module/sequential_module.py b/example/module/sequential_module.py index def0558d0669..bc567af31704 100644 --- a/example/module/sequential_module.py +++ b/example/module/sequential_module.py @@ -44,13 +44,13 @@ n_epoch = 2 batch_size = 100 train_dataiter = mx.io.MNISTIter( - image="data/train-images-idx3-ubyte", - label="data/train-labels-idx1-ubyte", + image="../image-classification/mnist/train-images-idx3-ubyte", + label="../image-classification/mnist/train-labels-idx1-ubyte", data_shape=(784,), batch_size=batch_size, shuffle=True, flat=True, silent=False, seed=10) 
val_dataiter = mx.io.MNISTIter( - image="data/t10k-images-idx3-ubyte", - label="data/t10k-labels-idx1-ubyte", + image="../image-classification/mnist/t10k-images-idx3-ubyte", + label="../image-classification/mnist/t10k-labels-idx1-ubyte", data_shape=(784,), batch_size=batch_size, shuffle=True, flat=True, silent=False) diff --git a/example/nce-loss/README.md b/example/nce-loss/README.md new file mode 100644 index 000000000000..88e54910bc7c --- /dev/null +++ b/example/nce-loss/README.md @@ -0,0 +1,35 @@ +#Examples of NCE Loss + +nce-loss is used to speedup multi-class classification when class num is huge. + +## Toy example + +* toy_softmax.py: a multi class example using softmax output +* toy_nce.py: a multi-class example using nce loss + +## Word2Vec + +* word2vec.py: a CBOW word2vec example using nce loss + +You can run it by + +``` +./get_text8.sh +python word2vec.py + +``` + +## LSTM + +* lstm_word.py: a lstm example use nce loss + +You can run it by + +``` +./get_text8.sh +python lstm_word.py +``` + +## References + +You can refer to [http://www.jianshu.com/p/e439b43ea464](http://www.jianshu.com/p/e439b43ea464) for more details. 
(In Chinese) diff --git a/example/nce-loss/get_text8.sh b/example/nce-loss/get_text8.sh new file mode 100755 index 000000000000..ccd4a08e69bb --- /dev/null +++ b/example/nce-loss/get_text8.sh @@ -0,0 +1,4 @@ +mkdir -p ./data/ +cd ./data/ +wget http://mattmahoney.net/dc/text8.zip +unzip text8.zip diff --git a/example/nce-loss/lstm_word.py b/example/nce-loss/lstm_word.py new file mode 100644 index 000000000000..6b4116c84ce8 --- /dev/null +++ b/example/nce-loss/lstm_word.py @@ -0,0 +1,222 @@ +# pylint:skip-file +import sys, random, time, math +sys.path.insert(0, "../../python") +import mxnet as mx +import numpy as np +from collections import namedtuple +from nce import * +from operator import itemgetter +from optparse import OptionParser + +LSTMState = namedtuple("LSTMState", ["c", "h"]) +LSTMParam = namedtuple("LSTMParam", ["i2h_weight", "i2h_bias", + "h2h_weight", "h2h_bias"]) +LSTMModel = namedtuple("LSTMModel", ["rnn_exec", "symbol", + "init_states", "last_states", + "seq_data", "seq_labels", "seq_outputs", + "param_blocks"]) + +def lstm(num_hidden, indata, prev_state, param, seqidx, layeridx, dropout=0.): + """LSTM Cell symbol""" + if dropout > 0.: + indata = mx.sym.Dropout(data=indata, p=dropout) + i2h = mx.sym.FullyConnected(data=indata, + weight=param.i2h_weight, + bias=param.i2h_bias, + num_hidden=num_hidden * 4, + name="t%d_l%d_i2h" % (seqidx, layeridx)) + h2h = mx.sym.FullyConnected(data=prev_state.h, + weight=param.h2h_weight, + bias=param.h2h_bias, + num_hidden=num_hidden * 4, + name="t%d_l%d_h2h" % (seqidx, layeridx)) + gates = i2h + h2h + slice_gates = mx.sym.SliceChannel(gates, num_outputs=4, + name="t%d_l%d_slice" % (seqidx, layeridx)) + in_gate = mx.sym.Activation(slice_gates[0], act_type="sigmoid") + in_transform = mx.sym.Activation(slice_gates[1], act_type="tanh") + forget_gate = mx.sym.Activation(slice_gates[2], act_type="sigmoid") + out_gate = mx.sym.Activation(slice_gates[3], act_type="sigmoid") + next_c = (forget_gate * prev_state.c) + (in_gate 
* in_transform) + next_h = out_gate * mx.sym.Activation(next_c, act_type="tanh") + return LSTMState(c=next_c, h=next_h) + + +def get_net(vocab_size, seq_len, num_label, num_lstm_layer, num_hidden): + param_cells = [] + last_states = [] + for i in range(num_lstm_layer): + param_cells.append(LSTMParam(i2h_weight=mx.sym.Variable("l%d_i2h_weight" % i), + i2h_bias=mx.sym.Variable("l%d_i2h_bias" % i), + h2h_weight=mx.sym.Variable("l%d_h2h_weight" % i), + h2h_bias=mx.sym.Variable("l%d_h2h_bias" % i))) + state = LSTMState(c=mx.sym.Variable("l%d_init_c" % i), + h=mx.sym.Variable("l%d_init_h" % i)) + last_states.append(state) + + data = mx.sym.Variable('data') + label = mx.sym.Variable('label') + label_weight = mx.sym.Variable('label_weight') + embed_weight = mx.sym.Variable('embed_weight') + label_embed_weight = mx.sym.Variable('label_embed_weight') + data_embed = mx.sym.Embedding(data = data, input_dim = vocab_size, + weight = embed_weight, + output_dim = 100, name = 'data_embed') + datavec = mx.sym.SliceChannel(data = data_embed, + num_outputs = seq_len, + squeeze_axis = True, name = 'data_slice') + labelvec = mx.sym.SliceChannel(data = label, + num_outputs = seq_len, + squeeze_axis = True, name = 'label_slice') + labelweightvec = mx.sym.SliceChannel(data = label_weight, + num_outputs = seq_len, + squeeze_axis = True, name = 'label_weight_slice') + probs = [] + for seqidx in range(seq_len): + hidden = datavec[seqidx] + + for i in range(num_lstm_layer): + next_state = lstm(num_hidden, indata = hidden, + prev_state = last_states[i], + param = param_cells[i], + seqidx = seqidx, layeridx = i) + hidden = next_state.h + last_states[i] = next_state + + probs.append(nce_loss(data = hidden, + label = labelvec[seqidx], + label_weight = labelweightvec[seqidx], + embed_weight = label_embed_weight, + vocab_size = vocab_size, + num_hidden = 100, + num_label = num_label)) + return mx.sym.Group(probs) + + +def load_data(name): + buf = open(name).read() + tks = buf.split(' ') + vocab = {} 
+ freq = [0] + data = [] + for tk in tks: + if len(tk) == 0: + continue + if tk not in vocab: + vocab[tk] = len(vocab) + 1 + freq.append(0) + wid = vocab[tk] + data.append(wid) + freq[wid] += 1 + negative = [] + for i, v in enumerate(freq): + if i == 0 or v < 5: + continue + v = int(math.pow(v * 1.0, 0.75)) + negative += [i for _ in range(v)] + return data, negative, vocab, freq + +class SimpleBatch(object): + def __init__(self, data_names, data, label_names, label): + self.data = data + self.label = label + self.data_names = data_names + self.label_names = label_names + + @property + def provide_data(self): + return [(n, x.shape) for n, x in zip(self.data_names, self.data)] + + @property + def provide_label(self): + return [(n, x.shape) for n, x in zip(self.label_names, self.label)] + + +class DataIter(mx.io.DataIter): + def __init__(self, name, batch_size, seq_len, num_label, init_states): + super(DataIter, self).__init__() + self.batch_size = batch_size + self.data, self.negative, self.vocab, self.freq = load_data(name) + self.vocab_size = 1 + len(self.vocab) + print self.vocab_size + self.seq_len = seq_len + self.num_label = num_label + self.init_states = init_states + self.init_state_names = [x[0] for x in self.init_states] + self.init_state_arrays = [mx.nd.zeros(x[1]) for x in init_states] + self.provide_data = [('data', (batch_size, seq_len))] + init_states + self.provide_label = [('label', (self.batch_size, seq_len, num_label)), + ('label_weight', (self.batch_size, seq_len, num_label))] + + def sample_ne(self): + return self.negative[random.randint(0, len(self.negative) - 1)] + + def __iter__(self): + print 'begin' + batch_data = [] + batch_label = [] + batch_label_weight = [] + for i in range(0, len(self.data) - self.seq_len - 1, self.seq_len): + data = self.data[i: i+self.seq_len] + label = [[self.data[i+k+1]] \ + + [self.sample_ne() for _ in range(self.num_label-1)]\ + for k in range(self.seq_len)] + label_weight = [[1.0] \ + + [0.0 for _ in 
range(self.num_label-1)]\ + for k in range(self.seq_len)] + + batch_data.append(data) + batch_label.append(label) + batch_label_weight.append(label_weight) + if len(batch_data) == self.batch_size: + data_all = [mx.nd.array(batch_data)] + self.init_state_arrays + label_all = [mx.nd.array(batch_label), mx.nd.array(batch_label_weight)] + data_names = ['data'] + self.init_state_names + label_names = ['label', 'label_weight'] + batch_data = [] + batch_label = [] + batch_label_weight = [] + yield SimpleBatch(data_names, data_all, label_names, label_all) + + def reset(self): + pass + +if __name__ == '__main__': + parser = OptionParser() + parser.add_option("-g", "--gpu", action = "store_true", dest = "gpu", default = False, + help = "use gpu") + batch_size = 1024 + seq_len = 5 + num_label = 6 + num_lstm_layer = 2 + num_hidden = 100 + + init_c = [('l%d_init_c'%l, (batch_size, num_hidden)) for l in range(num_lstm_layer)] + init_h = [('l%d_init_h'%l, (batch_size, num_hidden)) for l in range(num_lstm_layer)] + init_states = init_c + init_h + + data_train = DataIter("./data/text8", batch_size, seq_len, num_label, + init_states) + + network = get_net(data_train.vocab_size, seq_len, num_label, num_lstm_layer, num_hidden) + options, args = parser.parse_args() + devs = mx.cpu() + if options.gpu == True: + devs = mx.gpu() + model = mx.model.FeedForward(ctx = devs, + symbol = network, + num_epoch = 20, + learning_rate = 0.3, + momentum = 0.9, + wd = 0.0000, + initializer=mx.init.Xavier(factor_type="in", magnitude=2.34)) + + import logging + head = '%(asctime)-15s %(message)s' + logging.basicConfig(level=logging.DEBUG, format=head) + + metric = NceLSTMAuc() + model.fit(X = data_train, + eval_metric = metric, + batch_end_callback = mx.callback.Speedometer(batch_size, 50),) + diff --git a/example/nce-loss/nce.py b/example/nce-loss/nce.py new file mode 100644 index 000000000000..5b278a1eac3e --- /dev/null +++ b/example/nce-loss/nce.py @@ -0,0 +1,91 @@ +# pylint:skip-file +import sys 
+sys.path.insert(0, "../../python") +import mxnet as mx +import numpy as np +from operator import itemgetter + +def nce_loss(data, label, label_weight, embed_weight, vocab_size, num_hidden, num_label): + label_embed = mx.sym.Embedding(data = label, input_dim = vocab_size, + weight = embed_weight, + output_dim = num_hidden, name = 'label_embed') + data = mx.sym.Reshape(data = data, shape = (-1, 1, num_hidden)) + pred = mx.sym.broadcast_mul(data, label_embed) + pred = mx.sym.sum(data = pred, axis = 2) + return mx.sym.LogisticRegressionOutput(data = pred, + label = label_weight) + + +class NceAccuracy(mx.metric.EvalMetric): + def __init__(self): + super(NceAccuracy, self).__init__('nce-accuracy') + + def update(self, labels, preds): + label_weight = labels[1].asnumpy() + preds = preds[0].asnumpy() + for i in range(preds.shape[0]): + if np.argmax(label_weight[i]) == np.argmax(preds[i]): + self.sum_metric += 1 + self.num_inst += 1 + +class NceAuc(mx.metric.EvalMetric): + def __init__(self): + super(NceAuc, self).__init__('nce-auc') + + def update(self, labels, preds): + label_weight = labels[1].asnumpy() + preds = preds[0].asnumpy() + tmp = [] + for i in range(preds.shape[0]): + for j in range(preds.shape[1]): + tmp.append((label_weight[i][j], preds[i][j])) + tmp = sorted(tmp, key = itemgetter(1), reverse = True) + m = 0.0 + n = 0.0 + z = 0.0 + k = 0 + for a, b in tmp: + if a > 0.5: + m += 1.0 + z += len(tmp) - k + else: + n += 1.0 + k += 1 + z -= m * (m + 1.0) / 2.0 + z /= m + z /= n + self.sum_metric += z + self.num_inst += 1 + +class NceLSTMAuc(mx.metric.EvalMetric): + def __init__(self): + super(NceLSTMAuc, self).__init__('nce-lstm-auc') + + def update(self, labels, preds): + preds = np.array([x.asnumpy() for x in preds]) + preds = preds.reshape((preds.shape[0] * preds.shape[1], preds.shape[2])) + label_weight = labels[1].asnumpy() + label_weight = label_weight.transpose((1, 0, 2)) + label_weight = label_weight.reshape((preds.shape[0], preds.shape[1])) + + tmp = [] 
+ for i in range(preds.shape[0]): + for j in range(preds.shape[1]): + tmp.append((label_weight[i][j], preds[i][j])) + tmp = sorted(tmp, key = itemgetter(1), reverse = True) + m = 0.0 + n = 0.0 + z = 0.0 + k = 0 + for a, b in tmp: + if a > 0.5: + m += 1.0 + z += len(tmp) - k + else: + n += 1.0 + k += 1 + z -= m * (m + 1.0) / 2.0 + z /= m + z /= n + self.sum_metric += z + self.num_inst += 1 diff --git a/example/nce-loss/toy_nce.py b/example/nce-loss/toy_nce.py new file mode 100644 index 000000000000..89a5aaf9c35f --- /dev/null +++ b/example/nce-loss/toy_nce.py @@ -0,0 +1,111 @@ +# pylint:skip-file +import sys, random, time +sys.path.insert(0, "../../python") +import mxnet as mx +import numpy as np +from collections import namedtuple +from nce import * + +def get_net(vocab_size, num_label): + data = mx.sym.Variable('data') + label = mx.sym.Variable('label') + label_weight = mx.sym.Variable('label_weight') + embed_weight = mx.sym.Variable('embed_weight') + pred = mx.sym.FullyConnected(data = data, num_hidden = 100) + ret = nce_loss(data = pred, + label = label, + label_weight = label_weight, + embed_weight = embed_weight, + vocab_size = vocab_size, + num_hidden = 100, + num_label = num_label) + return ret + +class SimpleBatch(object): + def __init__(self, data_names, data, label_names, label): + self.data = data + self.label = label + self.data_names = data_names + self.label_names = label_names + + @property + def provide_data(self): + return [(n, x.shape) for n, x in zip(self.data_names, self.data)] + + @property + def provide_label(self): + return [(n, x.shape) for n, x in zip(self.label_names, self.label)] + + +class DataIter(mx.io.DataIter): + def __init__(self, count, batch_size, vocab_size, num_label, feature_size): + super(DataIter, self).__init__() + self.batch_size = batch_size + self.count = count + self.vocab_size = vocab_size + self.num_label = num_label + self.feature_size = feature_size + self.provide_data = [('data', (batch_size, feature_size))] + 
self.provide_label = [('label', (self.batch_size, num_label)), + ('label_weight', (self.batch_size, num_label))] + + def mock_sample(self): + ret = np.zeros(self.feature_size) + rn = set() + while len(rn) < 3: + rn.add(random.randint(0, self.feature_size - 1)) + s = 0 + for k in rn: + ret[k] = 1.0 + s *= self.feature_size + s += k + la = [s % self.vocab_size] +\ + [random.randint(0, self.vocab_size - 1) for _ in range(self.num_label - 1)] + return ret, la + + def __iter__(self): + for _ in range(self.count / self.batch_size): + data = [] + label = [] + label_weight = [] + for i in range(self.batch_size): + d, l = self.mock_sample() + data.append(d) + label.append(l) + label_weight.append([1.0] + [0.0 for _ in range(self.num_label - 1)]) + data_all = [mx.nd.array(data)] + label_all = [mx.nd.array(label), mx.nd.array(label_weight)] + data_names = ['data'] + label_names = ['label', 'label_weight'] + yield SimpleBatch(data_names, data_all, label_names, label_all) + + def reset(self): + pass + +if __name__ == '__main__': + batch_size = 128 + vocab_size = 10000 + feature_size = 100 + num_label = 6 + + data_train = DataIter(100000, batch_size, vocab_size, num_label, feature_size) + data_test = DataIter(1000, batch_size, vocab_size, num_label, feature_size) + + network = get_net(vocab_size, num_label) + devs = [mx.cpu()] + model = mx.model.FeedForward(ctx = devs, + symbol = network, + num_epoch = 20, + learning_rate = 0.03, + momentum = 0.9, + wd = 0.00001, + initializer=mx.init.Xavier(factor_type="in", magnitude=2.34)) + import logging + head = '%(asctime)-15s %(message)s' + logging.basicConfig(level=logging.DEBUG, format=head) + + metric = NceAccuracy() + model.fit(X = data_train, eval_data = data_test, + eval_metric = metric, + batch_end_callback = mx.callback.Speedometer(batch_size, 50),) + diff --git a/example/nce-loss/toy_softmax.py b/example/nce-loss/toy_softmax.py new file mode 100644 index 000000000000..83d0a1e6a43c --- /dev/null +++ 
b/example/nce-loss/toy_softmax.py @@ -0,0 +1,98 @@ +# pylint:skip-file +import sys, random, time +sys.path.insert(0, "../../python") +import mxnet as mx +import numpy as np +from collections import namedtuple + +ToyModel = namedtuple("ToyModel", ["ex", "symbol", "param_blocks"]) + +def get_net(vocab_size): + data = mx.sym.Variable('data') + label = mx.sym.Variable('label') + pred = mx.sym.FullyConnected(data = data, num_hidden = 100) + pred = mx.sym.FullyConnected(data = pred, num_hidden = vocab_size) + sm = mx.sym.SoftmaxOutput(data = pred, label = label) + return sm + +class SimpleBatch(object): + def __init__(self, data_names, data, label_names, label): + self.data = data + self.label = label + self.data_names = data_names + self.label_names = label_names + + @property + def provide_data(self): + return [(n, x.shape) for n, x in zip(self.data_names, self.data)] + + @property + def provide_label(self): + return [(n, x.shape) for n, x in zip(self.label_names, self.label)] + + +class DataIter(mx.io.DataIter): + def __init__(self, count, batch_size, vocab_size, num_label, feature_size): + super(DataIter, self).__init__() + self.batch_size = batch_size + self.count = count + self.vocab_size = vocab_size + self.num_label = num_label + self.feature_size = feature_size + self.provide_data = [('data', (batch_size, feature_size))] + self.provide_label = [('label', (self.batch_size,))] + + def mock_sample(self): + ret = np.zeros(self.feature_size) + rn = set() + while len(rn) < 3: + rn.add(random.randint(0, self.feature_size - 1)) + s = 0 + for k in rn: + ret[k] = 1.0 + s *= self.feature_size + s += k + return ret, s % self.vocab_size + + def __iter__(self): + for _ in range(self.count / self.batch_size): + data = [] + label = [] + for i in range(self.batch_size): + d, l = self.mock_sample() + data.append(d) + label.append(l) + data_all = [mx.nd.array(data)] + label_all = [mx.nd.array(label)] + data_names = ['data'] + label_names = ['label'] + yield SimpleBatch(data_names, 
data_all, label_names, label_all) + + def reset(self): + pass + +if __name__ == '__main__': + batch_size = 128 + vocab_size = 10000 + feature_size = 100 + num_label = 6 + + data_train = DataIter(100000, batch_size, vocab_size, num_label, feature_size) + data_test = DataIter(1000, batch_size, vocab_size, num_label, feature_size) + + network = get_net(vocab_size) + devs = mx.cpu() + model = mx.model.FeedForward(ctx = devs, + symbol = network, + num_epoch = 20, + learning_rate = 0.03, + momentum = 0.9, + wd = 0.0000, + initializer=mx.init.Xavier(factor_type="in", magnitude=2.34)) + import logging + head = '%(asctime)-15s %(message)s' + logging.basicConfig(level=logging.DEBUG, format=head) + + model.fit(X = data_train, eval_data = data_test, + batch_end_callback = mx.callback.Speedometer(batch_size, 50),) + diff --git a/example/nce-loss/wordvec.py b/example/nce-loss/wordvec.py new file mode 100644 index 000000000000..02e986e76813 --- /dev/null +++ b/example/nce-loss/wordvec.py @@ -0,0 +1,149 @@ +# pylint:skip-file +import sys, random, time, math +sys.path.insert(0, "../../python") +import mxnet as mx +import numpy as np +from collections import namedtuple +from nce import * +from operator import itemgetter +from optparse import OptionParser + +def get_net(vocab_size, num_input, num_label): + data = mx.sym.Variable('data') + label = mx.sym.Variable('label') + label_weight = mx.sym.Variable('label_weight') + embed_weight = mx.sym.Variable('embed_weight') + data_embed = mx.sym.Embedding(data = data, input_dim = vocab_size, + weight = embed_weight, + output_dim = 100, name = 'data_embed') + datavec = mx.sym.SliceChannel(data = data_embed, + num_outputs = num_input, + squeeze_axis = 1, name = 'data_slice') + pred = datavec[0] + for i in range(1, num_input): + pred = pred + datavec[i] + return nce_loss(data = pred, + label = label, + label_weight = label_weight, + embed_weight = embed_weight, + vocab_size = vocab_size, + num_hidden = 100, + num_label = num_label) + +def 
load_data(name): + buf = open(name).read() + tks = buf.split(' ') + vocab = {} + freq = [0] + data = [] + for tk in tks: + if len(tk) == 0: + continue + if tk not in vocab: + vocab[tk] = len(vocab) + 1 + freq.append(0) + wid = vocab[tk] + data.append(wid) + freq[wid] += 1 + negative = [] + for i, v in enumerate(freq): + if i == 0 or v < 5: + continue + v = int(math.pow(v * 1.0, 0.75)) + negative += [i for _ in range(v)] + return data, negative, vocab, freq + +class SimpleBatch(object): + def __init__(self, data_names, data, label_names, label): + self.data = data + self.label = label + self.data_names = data_names + self.label_names = label_names + + @property + def provide_data(self): + return [(n, x.shape) for n, x in zip(self.data_names, self.data)] + + @property + def provide_label(self): + return [(n, x.shape) for n, x in zip(self.label_names, self.label)] + + +class DataIter(mx.io.DataIter): + def __init__(self, name, batch_size, num_label): + super(DataIter, self).__init__() + self.batch_size = batch_size + self.data, self.negative, self.vocab, self.freq = load_data(name) + self.vocab_size = 1 + len(self.vocab) + print self.vocab_size + self.num_label = num_label + self.provide_data = [('data', (batch_size, num_label - 1))] + self.provide_label = [('label', (self.batch_size, num_label)), + ('label_weight', (self.batch_size, num_label))] + + def sample_ne(self): + return self.negative[random.randint(0, len(self.negative) - 1)] + + def __iter__(self): + print 'begin' + batch_data = [] + batch_label = [] + batch_label_weight = [] + start = random.randint(0, self.num_label - 1) + for i in range(start, len(self.data) - self.num_label - start, self.num_label): + context = self.data[i: i + self.num_label / 2] \ + + self.data[i + 1 + self.num_label / 2: i + self.num_label] + target_word = self.data[i + self.num_label / 2] + if self.freq[target_word] < 5: + continue + target = [target_word] \ + + [self.sample_ne() for _ in range(self.num_label - 1)] + target_weight = 
[1.0] + [0.0 for _ in range(self.num_label - 1)] + batch_data.append(context) + batch_label.append(target) + batch_label_weight.append(target_weight) + if len(batch_data) == self.batch_size: + data_all = [mx.nd.array(batch_data)] + label_all = [mx.nd.array(batch_label), mx.nd.array(batch_label_weight)] + data_names = ['data'] + label_names = ['label', 'label_weight'] + batch_data = [] + batch_label = [] + batch_label_weight = [] + yield SimpleBatch(data_names, data_all, label_names, label_all) + + def reset(self): + pass + +if __name__ == '__main__': + parser = OptionParser() + parser.add_option("-g", "--gpu", action = "store_true", dest = "gpu", default = False, + help = "use gpu") + batch_size = 256 + num_label = 5 + + data_train = DataIter("./data/text8", batch_size, num_label) + + network = get_net(data_train.vocab_size, num_label - 1, num_label) + + options, args = parser.parse_args() + devs = mx.cpu() + if options.gpu == True: + devs = mx.gpu() + model = mx.model.FeedForward(ctx = devs, + symbol = network, + num_epoch = 20, + learning_rate = 0.3, + momentum = 0.9, + wd = 0.0000, + initializer=mx.init.Xavier(factor_type="in", magnitude=2.34)) + + import logging + head = '%(asctime)-15s %(message)s' + logging.basicConfig(level=logging.DEBUG, format=head) + + metric = NceAuc() + model.fit(X = data_train, + eval_metric = metric, + batch_end_callback = mx.callback.Speedometer(batch_size, 50),) + diff --git a/example/notebooks/simple_bind.ipynb b/example/notebooks/simple_bind.ipynb index b1b470ea9286..4f7d30e2d5f2 100644 --- a/example/notebooks/simple_bind.ipynb +++ b/example/notebooks/simple_bind.ipynb @@ -4,14 +4,16 @@ "cell_type": "markdown", "metadata": {}, "source": [ + "## MXNet Symbol.simple_bind example\n", + "\n", "In this example, we will show how to use ```simple_bind``` API. \n", "\n", - "Note it is a low level API, by using low level API we are able to touch more details about MXNet" + "Note it is a low level API. 
By using such a low level API, we are able to interact with more details about MXNet." ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "metadata": { "collapsed": true }, @@ -20,6 +22,7 @@ "import mxnet as mx\n", "import numpy as np\n", "import logging\n", + "import pprint\n", "\n", "logger = logging.getLogger()\n", "logger.setLevel(logging.DEBUG)" @@ -35,7 +38,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "metadata": { "collapsed": false }, @@ -49,71 +52,82 @@ "\n", "\n", - "\n", - "\n", + "\n", + "\n", "plot\n", - "\n", - "\n", - "fc1\n", - "\n", - "FullyConnected\n", - "128\n", + "\n", + "\n", + "data\n", + "\n", + "data\n", "\n", - "\n", - "bn1\n", - "\n", - "BatchNorm\n", + "\n", + "fc1\n", + "\n", + "FullyConnected\n", + "128\n", "\n", - "\n", - "bn1->fc1\n", + "\n", + "fc1->data\n", "\n", "\n", - "128\n", + "784\n", "\n", - "\n", - "act1\n", - "\n", - "Activation\n", - "tanh\n", + "\n", + "bn1\n", + "\n", + "BatchNorm\n", "\n", - "\n", - "act1->bn1\n", + "\n", + "bn1->fc1\n", "\n", "\n", "128\n", "\n", - "\n", - "fc2\n", - "\n", - "FullyConnected\n", - "10\n", + "\n", + "act1\n", + "\n", + "Activation\n", + "tanh\n", "\n", - "\n", - "fc2->act1\n", + "\n", + "act1->bn1\n", "\n", "\n", "128\n", "\n", - "\n", - "softmax\n", - "\n", - "SoftmaxOutput\n", + "\n", + "fc2\n", + "\n", + "FullyConnected\n", + "10\n", "\n", - "\n", - "softmax->fc2\n", + "\n", + "fc2->act1\n", "\n", "\n", - "10\n", + "128\n", + "\n", + "\n", + "softmax\n", + "\n", + "SoftmaxOutput\n", + "\n", + "\n", + "softmax->fc2\n", + "\n", + "\n", + "10\n", "\n", "\n", "\n" ], "text/plain": [ - "" + "" ] }, - "execution_count": 3, + "execution_count": 2, "metadata": {}, "output_type": "execute_result" } @@ -141,7 +155,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "metadata": { "collapsed": true }, @@ -164,37 +178,13 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "After generating executor, we get 
get data from executor" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "# get argument arrays\n", - "arg_arrays = executor.arg_arrays\n", - "# get grad arrays\n", - "grad_arrays = executor.grad_arrays\n", - "# get aux_states arrays. Note: currently only BatchNorm symbol has auxiliary states, which is moving_mean and moving_var\n", - "aux_arrays = executor.aux_arrays\n", - "# get outputs from executor\n", - "output_arrays = executor.outputs" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The sequence of arrays is in same sequence of symbol arguments" + "After generating executor, get lists of inputs (args), outputs, etc\n", + "The order of these arrays is in same sequence as symbol arguments" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 4, "metadata": { "collapsed": false }, @@ -203,64 +193,73 @@ "name": "stdout", "output_type": "stream", "text": [ - "('args: ', {'bn1_beta': , 'fc2_weight': , 'fc1_weight': , 'softmax_label': , 'bn1_gamma': , 'fc2_bias': , 'data': , 'fc1_bias': })\n", + "args: {'bn1_beta': ,\n", + " 'bn1_gamma': ,\n", + " 'data': ,\n", + " 'fc1_bias': ,\n", + " 'fc1_weight': ,\n", + " 'fc2_bias': ,\n", + " 'fc2_weight': ,\n", + " 'softmax_label': }\n", "--------------------\n", - "('grads: ', {'bn1_beta': , 'fc2_weight': , 'fc1_weight': , 'softmax_label': None, 'bn1_gamma': , 'fc2_bias': , 'data': None, 'fc1_bias': })\n", + "grads: {'bn1_beta': ,\n", + " 'bn1_gamma': ,\n", + " 'data': ,\n", + " 'fc1_bias': ,\n", + " 'fc1_weight': ,\n", + " 'fc2_bias': ,\n", + " 'fc2_weight': ,\n", + " 'softmax_label': }\n", "--------------------\n", - "('aux_states: ', {'bn1_moving_mean': , 'bn1_moving_var': })\n", + "aux_states: {'bn1_moving_mean': ,\n", + " 'bn1_moving_var': }\n", "--------------------\n", - "('outputs: ', {'softmax_output': })\n" + "outputs: {'softmax_output': }\n" ] } ], "source": [ - "args = 
dict(zip(softmax.list_arguments(), arg_arrays))\n", - "grads = dict(zip(softmax.list_arguments(), grad_arrays))\n", - "outputs = dict(zip(softmax.list_outputs(), output_arrays))\n", - "aux_states = dict(zip(softmax.list_auxiliary_states(), aux_arrays))\n", + "args = executor.arg_dict\n", + "# Equivalently you could do this:\n", + "#args = dict(zip(softmax.list_arguments(), executor.arg_arrays))\n", + "grads = executor.grad_dict\n", + "aux_states = executor.aux_dict\n", + "\n", + "# For outputs we need to assemble the dict by hand:\n", + "outputs = dict(zip(softmax.list_outputs(), executor.outputs))\n", + "\n", "# we can print the args we have\n", - "print(\"args: \", args)\n", + "print(\"args: %s\" % pprint.pformat(args))\n", "print(\"-\" * 20)\n", - "print(\"grads: \", grads)\n", + "print(\"grads: %s\" % pprint.pformat(grads))\n", "print(\"-\" * 20)\n", - "print(\"aux_states: \", aux_states)\n", + "print(\"aux_states: %s\" % pprint.pformat(aux_states))\n", "print(\"-\" * 20)\n", - "print(\"outputs: \", outputs)" + "print(\"outputs: %s\" % pprint.pformat(outputs))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "The next step is intilize weight. We can set weight directly by using ```mx.random``` or numpy ndarray" + "The next step is intilize the weights. 
We can set weight directly by using ```mx.random``` or numpy ndarray" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [], "source": [ - "# helper function\n", - "def Init(key, arr):\n", - " if \"weight\" in key:\n", - " arr[:] = mx.random.uniform(-0.07, 0.07, arr.shape)\n", - " # or\n", - " # arr[:] = np.random.uniform(-0.07, 0.07, arr.shape)\n", - " elif \"gamma\" in key:\n", - " # for batch norm slope\n", - " arr[:] = 1.0\n", - " elif \"bias\" in key:\n", - " arr[:] = 0\n", - " elif \"beta\" in key:\n", - " # for batch norm bias\n", - " arr[:] = 0\n", - "\n", - "# Init args\n", - "for key, arr in args.items():\n", - " Init(key, arr)" + "args['fc1_weight'][:] = mx.random.uniform(-0.07, 0.07, args['fc1_weight'].shape)\n", + "args['fc2_weight'][:] = np.random.uniform(-0.07, 0.07, args['fc2_weight'].shape) # equivalent\n", + "args['bn1_beta'][:] = 1.0\n", + "args['bn1_gamma'][:] = 1.0\n", + "args['fc1_bias'][:] = 0\n", + "args['fc2_bias'][:] = 0\n", + "# Don't initialize data or softmax_label" ] }, { @@ -272,7 +271,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 6, "metadata": { "collapsed": true }, @@ -291,27 +290,34 @@ " elif \"bias\" in key or \"beta\" in key:\n", " weight[:] -= 2.0 * lr * (grad * norm)\n", " else:\n", - " pass\n", - " \n", - " " + " pass" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Then we will make a data iterator. We can either use build in iterator to load from binary file or build a numpy iterator.\n", + "Then we will make a data iterator. 
We can either use built-in iterator to load from binary file or build a numpy iterator.\n", "\n", "For special case, you are free to write your own iterator in python" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 7, "metadata": { "collapsed": false }, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python2.7/dist-packages/numpy/core/fromnumeric.py:2652: VisibleDeprecationWarning: `rank` is deprecated; use the `ndim` attribute or function instead. To find the rank of a matrix see `numpy.linalg.matrix_rank`.\n", + " VisibleDeprecationWarning)\n" + ] + } + ], "source": [ "# We use utils function in sklearn to get MNIST dataset in pickle\n", "from sklearn.datasets import fetch_mldata\n", @@ -341,7 +347,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 8, "metadata": { "collapsed": true }, @@ -361,7 +367,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 9, "metadata": { "collapsed": false }, @@ -371,14 +377,14 @@ "output_type": "stream", "text": [ "INFO:root:Finish training iteration 0\n", - "INFO:root:Train Acc: 0.9506\n", - "INFO:root:Val Acc: 0.9480\n", + "INFO:root:Train Acc: 0.8894\n", + "INFO:root:Val Acc: 0.9319\n", "INFO:root:Finish training iteration 1\n", - "INFO:root:Train Acc: 0.9579\n", - "INFO:root:Val Acc: 0.9521\n", + "INFO:root:Train Acc: 0.9372\n", + "INFO:root:Val Acc: 0.9472\n", "INFO:root:Finish training iteration 2\n", - "INFO:root:Train Acc: 0.9639\n", - "INFO:root:Val Acc: 0.9551\n" + "INFO:root:Train Acc: 0.9508\n", + "INFO:root:Val Acc: 0.9541\n" ] } ], @@ -461,7 +467,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", - "version": "2.7.10" + "version": "2.7.6" } }, "nbformat": 4, diff --git a/example/rcnn/README.md b/example/rcnn/README.md index 60f5527cb907..13756a7dfb31 100644 --- a/example/rcnn/README.md +++ b/example/rcnn/README.md @@ -1,9 +1,9 @@ # Faster R-CNN 
in MXNet with distributed implementation and data parallelization -Region Proposal Network solves object detection as a regression problem -from the objectness perspective. Bounding boxes are predicted by applying -learned bounding box deltas to base boxes, namely anchor boxes across -different positions in feature maps. Training process directly learns a +Region Proposal Network solves object detection as a regression problem +from the objectness perspective. Bounding boxes are predicted by applying +learned bounding box deltas to base boxes, namely anchor boxes across +different positions in feature maps. Training process directly learns a mapping from raw image intensities to bounding box transformation targets. Fast R-CNN treats general object detection as a classification problem and @@ -13,7 +13,7 @@ detection results. Cropping feature maps instead of image input accelerates computation utilizing shared convolution maps. Bounding box displacements are simultaneously learned in the training process. -Faster R-CNN utilize an alternate optimization training process between RPN +Faster R-CNN utilize an alternate optimization training process between RPN and Fast R-CNN. Fast R-CNN weights are used to initiate RPN for training. ## Getting Started @@ -47,6 +47,10 @@ and Fast R-CNN. Fast R-CNN weights are used to initiate RPN for training. * Start training by running `python -m tools.train_rcnn --proposal ss` to use the selective search proposal. * Start testing by running `python -m tools.test_rcnn --proposal ss`. 
+## Approximate Joint Training +* Supports training the faster-rcnn model with an end2end training method, implemented by approximate joint training, which is almost the same as [py-faster-rcnn](https://github.com/rbgirshick/py-faster-rcnn) +* Start end2end training by running `python -u train_end2end.py`; use `python train_end2end.py --help` to see how to set the training parameters, e.g. you can set the step for dropping the learning rate via `--factor-step`. + ## Information * Download link to trained model Baidu Yun: http://pan.baidu.com/s/1boRhGvH (ixiw) or Dropbox: https://www.dropbox.com/s/jrr83q0ai2ckltq/final-0000.params.tar.gz?dl=0 diff --git a/example/rcnn/demo.py b/example/rcnn/demo.py index fb110849663b..b0f806bd8fa7 100644 --- a/example/rcnn/demo.py +++ b/example/rcnn/demo.py @@ -2,21 +2,19 @@ import os import numpy as np import cv2 - import mxnet as mx - from helper.processing.image_processing import resize, transform from helper.processing.nms import nms from rcnn.config import config from rcnn.detector import Detector from rcnn.symbol import get_vgg_test -from rcnn.tester import vis_all_detection +from rcnn.tester import vis_all_detection, save_all_detection from utils.load_model import load_param def get_net(prefix, epoch, ctx): - args, auxs = load_param(prefix, epoch, convert=True, ctx=ctx) - sym = get_vgg_test() + args, auxs, num_class = load_param(prefix, epoch, convert=True, ctx=ctx) + sym = get_vgg_test(num_classes=num_class) detector = Detector(sym, ctx, args, auxs) return detector @@ -29,13 +27,14 @@ def get_net(prefix, epoch, ctx): 'sheep', 'sofa', 'train', 'tvmonitor') -def demo_net(detector, image_name): +def demo_net(detector, image_name, vis=False): """ wrapper for detector :param detector: Detector :param image_name: image name :return: None """ + config.TEST.HAS_RPN = True assert os.path.exists(image_name), image_name + ' not found' im = cv2.imread(image_name) @@ -60,7 +59,10 @@ def demo_net(detector, image_name): 
all_boxes[cls_ind] = dets[keep, :] boxes_this_image = [[]] + [all_boxes[j] for j in range(1, len(CLASSES))] - vis_all_detection(im_array, boxes_this_image, CLASSES, 0) + if vis: + vis_all_detection(im_array, boxes_this_image, CLASSES, 0) + else: + save_all_detection(im_array, boxes_this_image, CLASSES, 0) def parse_args(): @@ -78,4 +80,3 @@ def parse_args(): ctx = mx.gpu(args.gpu_id) detector = get_net(args.prefix, args.epoch, ctx) demo_net(detector, args.image) - demo_net(detector, args.image) diff --git a/example/rcnn/helper/processing/bbox_transform.py b/example/rcnn/helper/processing/bbox_transform.py index 0757a70eedd7..76fc937f80d7 100644 --- a/example/rcnn/helper/processing/bbox_transform.py +++ b/example/rcnn/helper/processing/bbox_transform.py @@ -32,7 +32,7 @@ def bbox_transform(ex_rois, gt_rois): return targets -def bbox_pred(boxes, box_deltas): +def bbox_pred(boxes, box_deltas, is_train=False): """ Transform the set of class-agnostic boxes into class-specific boxes by applying the predicted offsets (box_deltas) @@ -53,9 +53,15 @@ def bbox_pred(boxes, box_deltas): dy = box_deltas[:, 1::4] dw = box_deltas[:, 2::4] dh = box_deltas[:, 3::4] - + if is_train: + dx = np.array(map(lambda x: np.sign(x)*10 if abs(x) > 10 else x, dx)) + dy = np.array(map(lambda x: np.sign(x)*10 if abs(x) > 10 else x, dy)) pred_ctr_x = dx * widths[:, np.newaxis] + ctr_x[:, np.newaxis] pred_ctr_y = dy * heights[:, np.newaxis] + ctr_y[:, np.newaxis] + + if is_train: + dw = np.array(map(lambda x: np.sign(x)*8 if abs(x) > 8 else x, dw)) + dh = np.array(map(lambda x: np.sign(x)*8 if abs(x) > 8 else x, dh)) pred_w = np.exp(dw) * widths[:, np.newaxis] pred_h = np.exp(dh) * heights[:, np.newaxis] @@ -88,3 +94,18 @@ def clip_boxes(boxes, im_shape): # y2 < im_shape[0] boxes[:, 3::4] = np.maximum(np.minimum(boxes[:, 3::4], im_shape[0] - 1), 0) return boxes + +def clip_pad(boxes, pad_shape): + """ + Clip boxes of the pad area. 
+ :param boxes: [n, c, H, W] + :param im_shape: [h, w] + :return: [n, c, h, w] + """ + H, W = boxes.shape[2:] + h, w = pad_shape + if h < H: + boxes = boxes[:, :, :h, :].copy() + if w < W: + boxes = boxes[:, :, :, :w].copy() + return boxes diff --git a/example/rcnn/helper/processing/image_processing.py b/example/rcnn/helper/processing/image_processing.py index 5646c557929f..d03c73a643a0 100644 --- a/example/rcnn/helper/processing/image_processing.py +++ b/example/rcnn/helper/processing/image_processing.py @@ -21,7 +21,7 @@ def resize(im, target_size, max_size): return im, im_scale -def transform(im, pixel_means): +def transform(im, pixel_means, need_mean=False): """ transform into mxnet tensor substract pixel size and transform to correct format @@ -32,7 +32,8 @@ def transform(im, pixel_means): im = im.copy() im[:, :, (0, 1, 2)] = im[:, :, (2, 1, 0)] im = im.astype(float) - im -= pixel_means + if need_mean: + im -= pixel_means im_tensor = im[np.newaxis, :] # put channel first channel_swap = (0, 3, 1, 2) diff --git a/example/rcnn/rcnn/callback.py b/example/rcnn/rcnn/callback.py index 7b05628829ad..59df2442dffd 100644 --- a/example/rcnn/rcnn/callback.py +++ b/example/rcnn/rcnn/callback.py @@ -1,5 +1,6 @@ import time import logging +from rcnn.config import config class Speedometer(object): @@ -24,6 +25,8 @@ def __call__(self, param): name, value = param.eval_metric.get() logging.info("Epoch[%d] Batch [%d]\tSpeed: %.2f samples/sec\tTrain-%s=%f,\t%s=%f,\t%s=%f", param.epoch, count, speed, name[0], value[0], name[1], value[1], name[2], value[2]) + if config.END2END: + print "\t\t\t\t\t\t\tTrain-{}={},\t{}={},\t{}={}".format(name[3], value[3], name[4], value[4], name[5], value[5]) else: logging.info("Iter[%d] Batch [%d]\tSpeed: %.2f samples/sec", param.epoch, count, speed) diff --git a/example/rcnn/rcnn/config.py b/example/rcnn/rcnn/config.py index fb9826c1b186..74776f1fe93c 100644 --- a/example/rcnn/rcnn/config.py +++ b/example/rcnn/rcnn/config.py @@ -43,6 +43,7 @@ # 
used for end2end training # RPN proposal +config.END2END = 0 config.TRAIN.RPN_NMS_THRESH = 0.7 config.TRAIN.RPN_PRE_NMS_TOP_N = 12000 config.TRAIN.RPN_POST_NMS_TOP_N = 6000 @@ -51,7 +52,9 @@ config.TRAIN.BBOX_NORMALIZATION_PRECOMPUTED = False config.TRAIN.BBOX_MEANS = (0.0, 0.0, 0.0, 0.0) config.TRAIN.BBOX_STDS = (0.1, 0.1, 0.2, 0.2) - +config.TRAIN.BBOX_MEANS_INV = (0.0, 0.0, 0.0, 0.0) +config.TRAIN.BBOX_STDS_INV = (10.0, 10.0, 5.0, 5.0) +config.TRAIN.IMS_PER_BATCH = 1 config.TEST = edict() # R-CNN testing diff --git a/example/rcnn/rcnn/loader.py b/example/rcnn/rcnn/loader.py index cea0900245a3..dfe24baf6378 100644 --- a/example/rcnn/rcnn/loader.py +++ b/example/rcnn/rcnn/loader.py @@ -70,9 +70,14 @@ def reset(self): horz_inds = np.where(horz)[0] vert_inds = np.where(vert)[0] inds = np.hstack((np.random.permutation(horz_inds), np.random.permutation(vert_inds))) - inds = np.reshape(inds, (-1, 2)) - row_perm = np.random.permutation(np.arange(inds.shape[0])) - inds = np.reshape(inds[row_perm, :], (-1, )) + if inds.shape[0] % 2: + inds_ = np.reshape(inds[:-1], (-1, 2)) + row_perm = np.random.permutation(np.arange(inds_.shape[0])) + inds[:-1] = np.reshape(inds_[row_perm, :], (-1, )) + else: + inds = np.reshape(inds, (-1, 2)) + row_perm = np.random.permutation(np.arange(inds.shape[0])) + inds = np.reshape(inds[row_perm, :], (-1, )) self.index = inds else: np.random.shuffle(self.index) @@ -140,7 +145,7 @@ def get_batch(self): class AnchorLoader(mx.io.DataIter): def __init__(self, feat_sym, roidb, batch_size=1, shuffle=False, mode='train', ctx=None, work_load_list=None, - feat_stride=16, anchor_scales=(8, 16, 32), anchor_ratios=(0.5, 1, 2), allowed_border=0): + feat_stride=16, anchor_scales=(8, 16, 32), anchor_ratios=(0.5, 1, 2), allowed_border=0, need_mean=True): """ This Iter will provide roi data to Fast R-CNN network :param feat_sym: to infer shape of assign_output @@ -168,6 +173,7 @@ def __init__(self, feat_sym, roidb, batch_size=1, shuffle=False, mode='train', c 
self.anchor_ratios = anchor_ratios self.allowed_border = allowed_border + self.need_mean = need_mean self.cur = 0 self.size = len(roidb) self.index = np.arange(self.size) @@ -180,21 +186,29 @@ def __init__(self, feat_sym, roidb, batch_size=1, shuffle=False, mode='train', c self.get_batch() self.data_name = ['data', 'im_info'] self.label_name = ['label', 'bbox_target', 'bbox_inside_weight', 'bbox_outside_weight'] + if config.END2END == 1: + self.label_name.append('gt_boxes') @property def provide_data(self): if self.mode == 'train': - return [('data', self.data[0].shape)] + provide_data_ = [('data', self.data[0].shape)] + if config.END2END == 1: + provide_data_.append(('im_info', self.data[1].shape)) + return provide_data_ else: return [(k, v.shape) for k, v in self.data.items()] @property def provide_label(self): if self.mode == 'train': - return [('label', self.label[0].shape), - ('bbox_target', self.label[1].shape), - ('bbox_inside_weight', self.label[2].shape), - ('bbox_outside_weight', self.label[3].shape)] + provide_label_ = [('label', self.label[0].shape), + ('bbox_target', self.label[1].shape), + ('bbox_inside_weight', self.label[2].shape), + ('bbox_outside_weight', self.label[3].shape)] + if config.END2END == 1: + provide_label_.append(('gt_boxes', self.label[4].shape)) + return provide_label_ else: return [(k, v.shape) for k, v in self.data.items()] @@ -209,9 +223,14 @@ def reset(self): horz_inds = np.where(horz)[0] vert_inds = np.where(vert)[0] inds = np.hstack((np.random.permutation(horz_inds), np.random.permutation(vert_inds))) - inds = np.reshape(inds, (-1, 2)) - row_perm = np.random.permutation(np.arange(inds.shape[0])) - inds = np.reshape(inds[row_perm, :], (-1, )) + if inds.shape[0] % 2: + inds_ = np.reshape(inds[:-1], (-1, 2)) + row_perm = np.random.permutation(np.arange(inds_.shape[0])) + inds[:-1] = np.reshape(inds_[row_perm, :], (-1, )) + else: + inds = np.reshape(inds, (-1, 2)) + row_perm = np.random.permutation(np.arange(inds.shape[0])) + inds 
= np.reshape(inds[row_perm, :], (-1, )) self.index = inds else: np.random.shuffle(self.index) @@ -243,7 +262,7 @@ def get_batch(self): cur_to = min(cur_from + self.batch_size, self.size) roidb = [self.roidb[self.index[i]] for i in range(cur_from, cur_to)] if self.mode == 'test': - self.data, self.label = minibatch.get_minibatch(roidb, self.num_classes, self.mode) + self.data, self.label = minibatch.get_minibatch(roidb, self.num_classes, self.mode, need_mean=self.need_mean) else: work_load_list = self.work_load_list ctx = self.ctx @@ -257,7 +276,7 @@ def get_batch(self): label_list = [] for islice in slices: iroidb = [roidb[i] for i in range(islice.start, islice.stop)] - data, label = minibatch.get_minibatch(iroidb, self.num_classes, self.mode) + data, label = minibatch.get_minibatch(iroidb, self.num_classes, self.mode, need_mean=self.need_mean) data_list.append(data) label_list.append(label) @@ -278,17 +297,26 @@ def get_batch(self): label = minibatch.assign_anchor(feat_shape, label['gt_boxes'], data['im_info'], self.feat_stride, self.anchor_scales, self.anchor_ratios, self.allowed_border) - del data['im_info'] + # del data['im_info'] new_label_list.append(label) + assert len(label_list) == len(new_label_list),\ + "len(label_list)={},len(new_label_list)=".format(len(label_list), len(new_label_list)) all_data = dict() for key in ['data']: all_data[key] = tensor_vstack([batch[key] for batch in data_list]) + if config.END2END == 1: + for key in ['im_info']: + all_data[key] = tensor_vstack([batch[key] for batch in data_list]) all_label = dict() all_label['label'] = tensor_vstack([batch['label'] for batch in new_label_list], pad=-1) for key in ['bbox_target', 'bbox_inside_weight', 'bbox_outside_weight']: all_label[key] = tensor_vstack([batch[key] for batch in new_label_list]) + if config.END2END == 1: + for key in ['gt_boxes']: + # should reshape the first dim to 1, because for every device, the batch size should be 1 + all_label[key] = 
tensor_vstack([batch[key].reshape(1, -1) for batch in new_label_list], pad=-1) self.data = [mx.nd.array(all_data['data'])] @@ -296,3 +324,7 @@ def get_batch(self): mx.nd.array(all_label['bbox_target']), mx.nd.array(all_label['bbox_inside_weight']), mx.nd.array(all_label['bbox_outside_weight'])] + + if config.END2END == 1: + self.data.append(mx.nd.array(all_data['im_info'])) + self.label.append(mx.nd.array(all_label['gt_boxes'])) diff --git a/example/rcnn/rcnn/metric.py b/example/rcnn/rcnn/metric.py index b8bd90875604..90de820bf371 100644 --- a/example/rcnn/rcnn/metric.py +++ b/example/rcnn/rcnn/metric.py @@ -1,3 +1,8 @@ +""" +if config.END2END = 1, then preds = +[cls_label, rpn_cls_loss, rpn_bbox_loss, cls_loss, bbox_loss] +""" + import mxnet as mx import numpy as np @@ -5,11 +10,15 @@ class AccuracyMetric(mx.metric.EvalMetric): - def __init__(self, use_ignore=False, ignore=None): - super(AccuracyMetric, self).__init__('Accuracy') + def __init__(self, use_ignore=False, ignore=None, ex_rpn=False): + if ex_rpn: + super(AccuracyMetric, self).__init__('RPN-Accuracy') + else: + super(AccuracyMetric, self).__init__('Accuracy') self.use_ignore = use_ignore self.ignore = ignore - self.has_rpn = config.TRAIN.HAS_RPN + self.ex_rpn = ex_rpn # used in end2end joint training, export rpn loss + self.has_rpn = config.TRAIN.HAS_RPN and config.END2END != 1 if self.has_rpn: assert self.use_ignore and self.ignore is not None @@ -21,20 +30,37 @@ def update(self, labels, preds): pred_label = pred_label[non_ignore_inds] label = label[non_ignore_inds] else: - last_dim = preds[0].shape[-1] - pred_label = preds[0].asnumpy().reshape(-1, last_dim).argmax(axis=1).astype('int32') - label = labels[0].asnumpy().reshape(-1,).astype('int32') + if config.END2END != 1: + last_dim = preds[0].shape[-1] + pred_label = preds[0].asnumpy().reshape(-1, last_dim).argmax(axis=1).astype('int32') + label = labels[0].asnumpy().reshape(-1,).astype('int32') + else: + if self.ex_rpn: + pred_label = 
mx.ndarray.argmax_channel(preds[1]).asnumpy().astype('int32').reshape(1,-1) + label = labels[0].asnumpy().astype('int32') + # import pdb;pdb.set_trace() + non_ignore_inds = np.where(label != self.ignore) + pred_label = pred_label[non_ignore_inds] + label = label[non_ignore_inds] + else: + last_dim = preds[3].shape[-1] + pred_label = preds[3].asnumpy().reshape(-1, last_dim).argmax(axis=1).astype('int32') + label = preds[0].asnumpy().reshape(-1,).astype('int32') self.sum_metric += (pred_label.flat == label.flat).sum() self.num_inst += len(pred_label.flat) class LogLossMetric(mx.metric.EvalMetric): - def __init__(self, use_ignore=False, ignore=None): - super(LogLossMetric, self).__init__('LogLoss') + def __init__(self, use_ignore=False, ignore=None, ex_rpn=False): + if ex_rpn: + super(LogLossMetric, self).__init__('RPN-LogLoss') + else: + super(LogLossMetric, self).__init__('LogLoss') self.use_ignore = use_ignore self.ignore = ignore - self.has_rpn = config.TRAIN.HAS_RPN + self.ex_rpn = ex_rpn + self.has_rpn = config.TRAIN.HAS_RPN and config.END2END != 1 if self.has_rpn: assert self.use_ignore and self.ignore is not None @@ -46,10 +72,23 @@ def update(self, labels, preds): label = label[non_ignore_inds] cls = pred_cls[label, non_ignore_inds] else: - last_dim = preds[0].shape[-1] - pred_cls = preds[0].asnumpy().reshape(-1, last_dim) - label = labels[0].asnumpy().reshape(-1,).astype('int32') - cls = pred_cls[np.arange(label.shape[0]), label] + if config.END2END != 1: + last_dim = preds[0].shape[-1] + pred_cls = preds[0].asnumpy().reshape(-1, last_dim) + label = labels[0].asnumpy().reshape(-1,).astype('int32') + cls = pred_cls[np.arange(label.shape[0]), label] + else: + if self.ex_rpn: + pred_cls = preds[1].asnumpy()[0].reshape(2, -1) + label = labels[0].asnumpy().astype('int32')[0] + non_ignore_inds = np.where(label != self.ignore)[0] + label = label[non_ignore_inds] + cls = pred_cls[label, non_ignore_inds] + else: + last_dim = preds[3].shape[-1] + pred_cls = 
preds[3].asnumpy().reshape(-1, last_dim) + label = preds[0].asnumpy().reshape(-1,).astype('int32') + cls = pred_cls[np.arange(label.shape[0]), label] cls += config.EPS cls_loss = -1 * np.log(cls) cls_loss = np.sum(cls_loss) @@ -58,17 +97,30 @@ def update(self, labels, preds): class SmoothL1LossMetric(mx.metric.EvalMetric): - def __init__(self): - super(SmoothL1LossMetric, self).__init__('SmoothL1Loss') - self.has_rpn = config.TRAIN.HAS_RPN + def __init__(self, ex_rpn=False): + if ex_rpn: + super(SmoothL1LossMetric, self).__init__('RPN-SmoothL1Loss') + else: + super(SmoothL1LossMetric, self).__init__('SmoothL1Loss') + self.ex_rpn = ex_rpn + self.has_rpn = config.TRAIN.HAS_RPN and config.END2END != 1 def update(self, labels, preds): bbox_loss = preds[1].asnumpy() if self.has_rpn: bbox_loss = bbox_loss.reshape((bbox_loss.shape[0], -1)) else: - first_dim = bbox_loss.shape[0] * bbox_loss.shape[1] - bbox_loss = bbox_loss.reshape(first_dim, -1) + if config.END2END != 1: + first_dim = bbox_loss.shape[0] * bbox_loss.shape[1] + bbox_loss = bbox_loss.reshape(first_dim, -1) + else: + if self.ex_rpn: + bbox_loss = preds[2].asnumpy() + bbox_loss = bbox_loss.reshape((bbox_loss.shape[0], -1)) + else: + bbox_loss = preds[-1].asnumpy() + first_dim = bbox_loss.shape[0] * bbox_loss.shape[1] + bbox_loss = bbox_loss.reshape(first_dim, -1) self.num_inst += bbox_loss.shape[0] bbox_loss = np.sum(bbox_loss) self.sum_metric += bbox_loss diff --git a/example/rcnn/rcnn/minibatch.py b/example/rcnn/rcnn/minibatch.py index 920d27eef22b..e52ed5500999 100644 --- a/example/rcnn/rcnn/minibatch.py +++ b/example/rcnn/rcnn/minibatch.py @@ -33,7 +33,7 @@ from rcnn.config import config -def get_minibatch(roidb, num_classes, mode='test'): +def get_minibatch(roidb, num_classes, mode='test', need_mean=True): """ return minibatch of images in roidb :param roidb: a list of dict, whose length controls batch size @@ -44,7 +44,7 @@ def get_minibatch(roidb, num_classes, mode='test'): # build im_array: [num_images, 
c, h, w] num_images = len(roidb) random_scale_indexes = npr.randint(0, high=len(config.SCALES), size=num_images) - im_array, im_scales = get_image_array(roidb, config.SCALES, random_scale_indexes) + im_array, im_scales = get_image_array(roidb, config.SCALES, random_scale_indexes, need_mean=need_mean) if mode == 'train': cfg_key = 'TRAIN' @@ -124,7 +124,7 @@ def get_minibatch(roidb, num_classes, mode='test'): return data, label -def get_image_array(roidb, scales, scale_indexes): +def get_image_array(roidb, scales, scale_indexes, need_mean=True): """ build image array from specific roidb :param roidb: images to be processed @@ -141,7 +141,7 @@ def get_image_array(roidb, scales, scale_indexes): im = im[:, ::-1, :] target_size = scales[scale_indexes[i]] im, im_scale = image_processing.resize(im, target_size, config.MAX_SIZE) - im_tensor = image_processing.transform(im, config.PIXEL_MEANS) + im_tensor = image_processing.transform(im, config.PIXEL_MEANS, need_mean=need_mean) processed_ims.append(im_tensor) im_scales.append(im_scale) array = image_processing.tensor_vstack(processed_ims) @@ -347,9 +347,9 @@ def _compute_targets(ex_rois, gt_rois): positive_weights = np.ones((1, 4)) * 1.0 / num_examples negative_weights = np.ones((1, 4)) * 1.0 / num_examples else: - assert ((config.TRAIN.RPN_POSTIVE_WEIGHT > 0) & (config.TRAIN.RPN_POSTIVE_WEIGHT < 1)) - positive_weights = config.TRAIN.RPN_POSTIVE_WEIGHT / np.sum(labels == 1) - negative_weights = (1.0 - config.TRAIN.RPN_POSTIVE_WEIGHT) / np.sum(labels == 1) + assert ((config.TRAIN.RPN_POSITIVE_WEIGHT > 0) & (config.TRAIN.RPN_POSITIVE_WEIGHT < 1)) + positive_weights = config.TRAIN.RPN_POSITIVE_WEIGHT / np.sum(labels == 1) + negative_weights = (1.0 - config.TRAIN.RPN_POSITIVE_WEIGHT) / np.sum(labels == 1) bbox_outside_weights[labels == 1, :] = positive_weights bbox_outside_weights[labels == 0, :] = negative_weights @@ -388,4 +388,8 @@ def _compute_targets(ex_rois, gt_rois): 'bbox_target': bbox_targets, 'bbox_inside_weight': 
bbox_inside_weights, 'bbox_outside_weight': bbox_outside_weights} + + if config.END2END == 1: + label.update({'gt_boxes': gt_boxes}) + return label diff --git a/example/rcnn/rcnn/rpn/proposal.py b/example/rcnn/rcnn/rpn/proposal.py index b0303c5cfd84..bc1a3a19ddbb 100644 --- a/example/rcnn/rcnn/rpn/proposal.py +++ b/example/rcnn/rcnn/rpn/proposal.py @@ -9,8 +9,9 @@ from rcnn.config import config from helper.processing.generate_anchor import generate_anchors -from helper.processing.bbox_transform import bbox_pred, clip_boxes +from helper.processing.bbox_transform import bbox_pred, clip_boxes, clip_pad from helper.processing.nms import nms +import logging DEBUG = False @@ -46,7 +47,6 @@ def forward(self, is_train, req, in_data, out_data, aux): # apply NMS with threshold 0.7 to remaining proposals # take after_nms_topN proposals after NMS # return the top proposals (-> RoIs top, scores top) - pre_nms_topN = config[self.cfg_key].RPN_PRE_NMS_TOP_N post_nms_topN = config[self.cfg_key].RPN_POST_NMS_TOP_N nms_thresh = config[self.cfg_key].RPN_NMS_THRESH @@ -55,19 +55,24 @@ def forward(self, is_train, req, in_data, out_data, aux): # the first set of anchors are background probabilities # keep the second part scores = in_data[0].asnumpy()[:, self._num_anchors:, :, :] + if np.isnan(scores).any(): + raise ValueError("there is nan in input scores") bbox_deltas = in_data[1].asnumpy() + if np.isnan(bbox_deltas).any(): + raise ValueError("there is nan in input bbox_deltas") im_info = in_data[2].asnumpy()[0, :] - if DEBUG: print 'im_size: ({}, {})'.format(im_info[0], im_info[1]) print 'scale: {}'.format(im_info[2]) # 1. 
Generate proposals from bbox_deltas and shifted anchors height, width = scores.shape[-2:] + if self.cfg_key == 'TRAIN': + height, width = int(im_info[0] / self._feat_stride), int(im_info[1] / self._feat_stride) if DEBUG: print 'score map size: {}'.format(scores.shape) - + print "resudial = ", scores.shape[2] - height, scores.shape[3] - width # Enumerate all shifts shift_x = np.arange(0, width) * self._feat_stride shift_y = np.arange(0, height) * self._feat_stride @@ -84,7 +89,6 @@ def forward(self, is_train, req, in_data, out_data, aux): K = shifts.shape[0] anchors = self._anchors.reshape((1, A, 4)) + shifts.reshape((1, K, 4)).transpose((1, 0, 2)) anchors = anchors.reshape((K * A, 4)) - # Transpose and reshape predicted bbox transformations to get them # into the same order as the anchors: # @@ -92,6 +96,7 @@ def forward(self, is_train, req, in_data, out_data, aux): # transpose to (1, H, W, 4 * A) # reshape to (1 * H * W * A, 4) where rows are ordered by (h, w, a) # in slowest to fastest order + bbox_deltas = clip_pad(bbox_deltas, (height, width)) bbox_deltas = bbox_deltas.transpose((0, 2, 3, 1)).reshape((-1, 4)) # Same story for the scores: @@ -110,9 +115,9 @@ def forward(self, is_train, req, in_data, out_data, aux): # 3. remove predicted boxes with either height or width < threshold # (NOTE: convert min_size to input image scale stored in im_info[2]) keep = ProposalOperator._filter_boxes(proposals, min_size * im_info[2]) + proposals = proposals[keep, :] scores = scores[keep] - # 4. sort all (proposal, score) pairs by score from highest to lowest # 5. take top pre_nms_topN (e.g. 6000) order = scores.ravel().argsort()[::-1] @@ -120,7 +125,6 @@ def forward(self, is_train, req, in_data, out_data, aux): order = order[:pre_nms_topN] proposals = proposals[order, :] scores = scores[order] - # 6. apply nms (e.g. threshold = 0.7) # 7. take after_nms_topN (e.g. 300) # 8. 
return the top proposals (-> RoIs top) @@ -129,23 +133,24 @@ def forward(self, is_train, req, in_data, out_data, aux): keep = keep[:post_nms_topN] # pad to ensure output size remains unchanged if len(keep) < post_nms_topN: + if len(keep) == 0: + logging.log(logging.ERROR, "currently len(keep) is zero") pad = npr.choice(keep, size=post_nms_topN - len(keep)) keep = np.hstack((keep, pad)) proposals = proposals[keep, :] scores = scores[keep] - # Output rois array # Our RPN implementation only supports a single input image, so all # batch inds are 0 batch_inds = np.zeros((proposals.shape[0], 1), dtype=np.float32) blob = np.hstack((batch_inds, proposals.astype(np.float32, copy=False))) self.assign(out_data[0], req[0], blob) - if self._output_score: self.assign(out_data[1], req[1], scores.astype(np.float32, copy=False)) def backward(self, req, out_grad, in_data, out_data, in_grad, aux): - pass + self.assign(in_grad[0], req[0], 0) + self.assign(in_grad[1], req[1], 0) @staticmethod def _filter_boxes(boxes, min_size): @@ -163,7 +168,7 @@ def __init__(self, feat_stride, scales, ratios, is_train=False, output_score=Fal self._feat_stride = feat_stride self._scales = scales self._ratios = ratios - self._is_train = is_train + self._is_train = True if is_train == 'True' else False self._output_score = output_score if self._is_train: diff --git a/example/rcnn/rcnn/rpn/proposal_target.py b/example/rcnn/rcnn/rpn/proposal_target.py new file mode 100644 index 000000000000..80f3206743bf --- /dev/null +++ b/example/rcnn/rcnn/rpn/proposal_target.py @@ -0,0 +1,196 @@ +""" +Proposal Operator transform anchor coordinates into ROI coordinates with prediction results on +classification probability and bounding box prediction results, and image size and scale information. 
+""" + +import mxnet as mx +import numpy as np +import numpy.random as npr +from rcnn.config import config +from helper.processing.bbox_regression import bbox_overlaps +from helper.processing.bbox_regression import expand_bbox_regression_targets +from helper.processing.bbox_transform import bbox_transform +from helper.processing.generate_anchor import generate_anchors +import logging + + +DEBUG = False + +class ProposalTargetOperator(mx.operator.CustomOp): + def __init__(self, num_classes, is_train=False): + super(ProposalTargetOperator, self).__init__() + self._num_classes = int(num_classes) + + if DEBUG: + self._count = 0 + self._fg_num = 0 + self._bg_num = 0 + if is_train: + self.cfg_key = 'TRAIN' + else: + self.cfg_key = 'TEST' + self._img_per_batch = 1 + + def forward(self, is_train, req, in_data, out_data, aux): + assert config.TRAIN.BATCH_SIZE % self._img_per_batch == 0, \ + 'IMAGESPERBATCH {} must devide BATCHSIZE {}'.format(self._img_per_batch, config.TRAIN.BATCH_SIZE) + num_images = self._img_per_batch # 1 + assert num_images == 1, "only support signle image" + rois_per_image = config.TRAIN.BATCH_SIZE / self._img_per_batch + fg_rois_per_image = np.round(config.TRAIN.FG_FRACTION * rois_per_image).astype(int) # neg : pos=3 : 1 + all_rois = in_data[0].asnumpy() + gt_boxes = in_data[1].asnumpy() + gt_boxes = gt_boxes[np.where(gt_boxes[:, :5].mean(axis=1) != -1)] + + # Include ground-truth boxes in the set of candidate rois + zeros = np.zeros((gt_boxes.shape[0], 1), dtype=gt_boxes.dtype) + all_rois = np.vstack( + (all_rois, np.hstack((zeros, gt_boxes[:, :-1]))) + ) + # Sanity check: single batch only + assert np.all(all_rois[:, 0] == 0), \ + 'Only single item batches are supported' + + # Sample rois with classification labels and bounding box regression + # targets + labels, rois, bbox_targets, bbox_inside_weights = _sample_rois( + all_rois, gt_boxes, fg_rois_per_image, + rois_per_image, self._num_classes, self.cfg_key) + + if DEBUG: + print "labels=", labels 
+ print 'num fg: {}'.format((labels > 0).sum()) + print 'num bg: {}'.format((labels == 0).sum()) + self._count += 1 + self._fg_num += (labels > 0).sum() + self._bg_num += (labels == 0).sum() + print "self._count=", self._count + print 'num fg avg: {}'.format(self._fg_num / self._count) + print 'num bg avg: {}'.format(self._bg_num / self._count) + print 'ratio: {:.3f}'.format(float(self._fg_num) / float(self._bg_num)) + + self.assign(out_data[0], req[0], rois) + self.assign(out_data[1], req[1], labels) + self.assign(out_data[2], req[2], bbox_targets) + self.assign(out_data[3], req[3], bbox_inside_weights) + self.assign(out_data[4], req[4], np.array(bbox_inside_weights > 0).astype(np.float32) ) # no normalization + + def backward(self, req, out_grad, in_data, out_data, in_grad, aux): + self.assign(in_grad[0], req[0], 0) + self.assign(in_grad[1], req[1], 0) + +@mx.operator.register("proposal_target") +class ProposalTargetProp(mx.operator.CustomOpProp): + def __init__(self, num_classes, is_train=False): + super(ProposalTargetProp, self).__init__(need_top_grad=False) + self._num_classes = int(num_classes) + self._is_train = True if is_train == 'True' else False + if self._is_train: + self.cfg_key = 'TRAIN' + else: + self.cfg_key = 'TEST' + self._img_per_batch = 1 + + def list_arguments(self): + return ['rpn_roi', 'gt_boxes'] + + def list_outputs(self): + return ['roi', 'label', 'bbox_target', 'bbox_inside_weight', 'bbox_outside_weight'] + + def infer_shape(self, in_shape): + rpn_roi_shape = in_shape[0] + gt_boxes_shape = in_shape[1] + + batch_size = config.TRAIN.BATCH_SIZE / self._img_per_batch + # output shape + roi_shape = (batch_size, 5) # used for input of roi-pooling + label_shape = (batch_size, ) # becauseful not set (batch_size, 1) + bbox_target_shape = (batch_size, self._num_classes * 4) + bbox_inside_weight_shape = (batch_size, self._num_classes * 4) + bbox_outside_weight_shape = (batch_size, self._num_classes * 4) + + return [rpn_roi_shape, gt_boxes_shape], 
[roi_shape, label_shape, bbox_target_shape, bbox_inside_weight_shape, bbox_outside_weight_shape] + + def create_operator(self, ctx, shapes, dtypes): + return ProposalTargetOperator(self._num_classes, self._is_train) + + def declare_backward_dependency(self, out_grad, in_data, out_data): + return [] + +def _compute_targets(ex_rois, gt_rois, labels): + """Compute bounding-box regression targets for an image.""" + + assert ex_rois.shape[0] == gt_rois.shape[0] + assert ex_rois.shape[1] == 4 + assert gt_rois.shape[1] == 4 + + targets = bbox_transform(ex_rois, gt_rois) + if config.TRAIN.BBOX_NORMALIZATION_PRECOMPUTED: + # Optionally normalize targets by a precomputed mean and stdev + targets = ((targets - np.array(config.TRAIN.BBOX_MEANS)) + / np.array(config.TRAIN.BBOX_STDS)) + return np.hstack( + (labels[:, np.newaxis], targets)).astype(np.float32, copy=False) + +def _sample_rois(all_rois, gt_boxes, fg_rois_per_image, rois_per_image, num_classes, key): + """Generate a random sample of RoIs comprising foreground and background + examples. 
+ """ + # overlaps: (rois x gt_boxes) + overlaps = bbox_overlaps( + np.ascontiguousarray(all_rois[:, 1:5], dtype=np.float), + np.ascontiguousarray(gt_boxes[:, :4], dtype=np.float)) + gt_assignment = overlaps.argmax(axis=1) + max_overlaps = overlaps.max(axis=1) + labels = gt_boxes[gt_assignment, 4] + + # Select foreground RoIs as those with >= FG_THRESH overlap + fg_inds = np.where(max_overlaps >= config.TRAIN.FG_THRESH)[0] + # Guard against the case when an image has fewer than fg_rois_per_image + # foreground RoIs + fg_rois_per_this_image = min(fg_rois_per_image, fg_inds.size) + # Sample foreground regions without replacement + if fg_inds.size > 0: + fg_inds = npr.choice(fg_inds, size=fg_rois_per_this_image, replace=False) + if fg_inds.size < fg_rois_per_image: + fg_inds_ = npr.choice(fg_inds, size=fg_rois_per_image-fg_inds.size, replace=True) + fg_inds = np.hstack((fg_inds_, fg_inds)) + + # Select background RoIs as those within [BG_THRESH_LO, BG_THRESH_HI) + bg_inds_ = np.where((max_overlaps < config.TRAIN.BG_THRESH_HI) & + (max_overlaps >= config.TRAIN.BG_THRESH_LO))[0] + if len(bg_inds_) == 0 and key == 'TRAIN': + bg_inds = np.where((max_overlaps < config.TRAIN.BG_THRESH_HI+0.2) & + (max_overlaps >= 0))[0] + else: + bg_inds = bg_inds_ + + if len(bg_inds) == 0: + logging.log(logging.ERROR, "currently len(bg_inds) is zero") + + # Compute number of background RoIs to take from this image (guarding + # against there being fewer than desired) + bg_rois_per_this_image = rois_per_image - len(fg_inds) + bg_rois_per_this_image = min(bg_rois_per_this_image, bg_inds.size) + # Sample background regions without replacement + if bg_inds.size > 0: + bg_inds = npr.choice(bg_inds, size=bg_rois_per_this_image, replace=False) + if bg_inds.size < rois_per_image-fg_rois_per_image: + bg_inds_ = npr.choice(bg_inds, size=rois_per_image-fg_rois_per_image-bg_inds.size, replace=True) + bg_inds = np.hstack((bg_inds_, bg_inds)) + + # The indices that we're selecting (both fg and bg) + 
keep_inds = np.append(fg_inds, bg_inds) + # Select sampled values from various arrays: + labels = labels[keep_inds] + # Clamp labels for the background RoIs to 0 + labels[fg_rois_per_this_image:] = 0 + rois = all_rois[keep_inds] + + bbox_target_data = _compute_targets( + rois[:, 1:5], gt_boxes[gt_assignment[keep_inds], :4], labels) + + bbox_targets, bbox_inside_weights = \ + expand_bbox_regression_targets(bbox_target_data, num_classes) + + return labels, rois, bbox_targets, bbox_inside_weights \ No newline at end of file diff --git a/example/rcnn/rcnn/symbol.py b/example/rcnn/rcnn/symbol.py index e483fdc4f0b2..4ea62c931e4d 100644 --- a/example/rcnn/rcnn/symbol.py +++ b/example/rcnn/rcnn/symbol.py @@ -1,5 +1,5 @@ import mxnet as mx -import rpn.proposal +import rpn.proposal, rpn.proposal_target from config import config @@ -299,3 +299,89 @@ def get_vgg_test(num_classes=21, num_anchors=9): # group output group = mx.symbol.Group([rois, cls_prob, bbox_pred]) return group + +def get_faster_rcnn(num_classes=21, num_anchors=9): + """ + Faster R-CNN with VGG 16 conv layers + :param num_classes: used to determine output size + :return: Symbol + """ + data = mx.symbol.Variable(name="data") + im_info = mx.symbol.Variable(name="im_info") + # label + gt_boxes = mx.symbol.Variable(name="gt_boxes") + label = mx.symbol.Variable(name='label') + bbox_target = mx.symbol.Variable(name='bbox_target') + bbox_inside_weight = mx.symbol.Variable(name='bbox_inside_weight') + bbox_outside_weight = mx.symbol.Variable(name='bbox_outside_weight') + + gt_boxes = mx.symbol.Reshape(data=gt_boxes, shape=(-1, 5), name='gt_boxes_reshape') + + relu5_3 = get_vgg_conv(data) + + ## RPN + rpn_conv = mx.symbol.Convolution( + data=relu5_3, kernel=(3, 3), pad=(1, 1), num_filter=512, name="rpn_conv_3x3") + rpn_relu = mx.symbol.Activation(data=rpn_conv, act_type="relu", name="rpn_relu") + rpn_cls_score = mx.symbol.Convolution( + data=rpn_relu, kernel=(1, 1), pad=(0, 0), num_filter=2 * num_anchors, 
name="rpn_cls_score") + rpn_bbox_pred = mx.symbol.Convolution( + data=rpn_relu, kernel=(1, 1), pad=(0, 0), num_filter=4 * num_anchors, name="rpn_bbox_pred") + # prepare rpn data + rpn_cls_score_reshape = mx.symbol.Reshape( + data=rpn_cls_score, shape=(0, 2, -1, 0), name="rpn_cls_score_reshape") + + # classification + rpn_cls_loss = mx.symbol.SoftmaxOutput(data=rpn_cls_score_reshape, label=label, multi_output=True, + normalization='valid', use_ignore=True, ignore_label=-1, name="rpn_cls_loss") + # bounding box regression + rpn_bbox_loss_ = bbox_outside_weight * \ + mx.symbol.smooth_l1(name='rpn_bbox_loss_', scalar=3.0, + data=bbox_inside_weight * (rpn_bbox_pred - bbox_target)) + rpn_bbox_loss = mx.sym.MakeLoss(name='rpn_bbox_loss', data=rpn_bbox_loss_) + + rpn_cls_prob = mx.symbol.SoftmaxActivation( + data=rpn_cls_score_reshape, mode="channel", name="rpn_cls_prob") + rpn_cls_prob_reshape = mx.symbol.Reshape( + data=rpn_cls_prob, shape=(0, 2 * num_anchors, -1, 0), name='rpn_cls_prob_reshape') + + rpn_roi = mx.symbol.Custom( + cls_prob=rpn_cls_prob_reshape, bbox_pred=rpn_bbox_pred, im_info=im_info, name='rpn_rois', + op_type='proposal', feat_stride=16, scales=(8, 16, 32), ratios=(0.5, 1, 2), is_train=True) # TODO(be careful of cls_prob) + rois = mx.symbol.Custom( + rpn_roi=rpn_roi, gt_boxes=gt_boxes, name='rois', op_type='proposal_target', + num_classes=num_classes, is_train=True) # + + # R-CNN + pool5 = mx.symbol.ROIPooling( + name='roi_pool5', data=relu5_3, rois=rois[0], pooled_size=(7, 7), spatial_scale=0.0625) + + # group 6 + flatten = mx.symbol.Flatten(data=pool5, name="flatten") + fc6 = mx.symbol.FullyConnected(data=flatten, num_hidden=4096, name="fc6") + relu6 = mx.symbol.Activation(data=fc6, act_type="relu", name="relu6") + drop6 = mx.symbol.Dropout(data=relu6, p=0.5, name="drop6") + # group 7 + fc7 = mx.symbol.FullyConnected(data=drop6, num_hidden=4096, name="fc7") + relu7 = mx.symbol.Activation(data=fc7, act_type="relu", name="relu7") + drop7 = 
mx.symbol.Dropout(data=relu7, p=0.5, name="drop7") + # classification + cls_score = mx.symbol.FullyConnected(name='cls_score', data=drop7, num_hidden=num_classes) + + + cls_prob = mx.symbol.SoftmaxOutput(name='cls_prob', data=cls_score, label=rois[1], normalization='batch') + # bounding box regression + bbox_pred = mx.symbol.FullyConnected(name='bbox_pred', data=drop7, num_hidden=num_classes * 4) + bbox_loss_ = rois[4] * \ + mx.symbol.smooth_l1(name='bbox_loss_', scalar=1.0, + data=rois[3] * (bbox_pred - rois[2])) + bbox_loss = mx.sym.MakeLoss(name='bbox_loss', data=bbox_loss_, grad_scale=1.0 / config.TRAIN.BATCH_SIZE) + + # reshape output + cls_prob = mx.symbol.Reshape(data=cls_prob, shape=(config.TRAIN.IMS_PER_BATCH, -1, num_classes), name='cls_prob_reshape') + bbox_pred = mx.symbol.Reshape(data=bbox_loss, shape=(config.TRAIN.IMS_PER_BATCH, -1, 4 * num_classes), name='bbox_pred_reshape') + + # group output + group = mx.symbol.Group([rois[1], rpn_cls_loss, rpn_bbox_loss, cls_prob, bbox_pred]) # rois[1] is used for evaluation + + return group \ No newline at end of file diff --git a/example/rcnn/rcnn/tester.py b/example/rcnn/rcnn/tester.py index 0dc253e3878b..c6c65140e5fa 100644 --- a/example/rcnn/rcnn/tester.py +++ b/example/rcnn/rcnn/tester.py @@ -113,3 +113,25 @@ def vis_all_detection(im_array, detections, imdb_classes=None, thresh=0.7): '{:s} {:.3f}'.format(imdb_classes[j], score), bbox=dict(facecolor=color, alpha=0.5), fontsize=12, color='white') plt.show() + +def save_all_detection(im_array, detections, imdb_classes=None, thresh=0.7): + """ + save all detections in one image with result.png + :param im_array: [b=1 c h w] in rgb + :param detections: [ numpy.ndarray([[x1 y1 x2 y2 score]]) for j in classes ] + :param imdb_classes: list of names in imdb + :param thresh: threshold for valid detections + :return: + """ + import random + im = image_processing.transform_inverse(im_array, config.PIXEL_MEANS) + im = im[:, :, ::-1].copy() # back to b,g,r + for j in 
range(1, len(imdb_classes)): + color = (255*random.random(), 255*random.random(), 255*random.random()) # generate a random color + dets = detections[j] + for i in range(dets.shape[0]): + bbox = dets[i, :4] + score = dets[i, -1] + if score > thresh: + cv2.rectangle(im, (int(round(bbox[0])), int(round(bbox[1]))), (int(round(bbox[2])), int(round(bbox[3]))), color, 2) + cv2.imwrite("result.jpg", im) \ No newline at end of file diff --git a/example/rcnn/test.py b/example/rcnn/test.py index 74ffc40673c2..05d36d78d9d2 100644 --- a/example/rcnn/test.py +++ b/example/rcnn/test.py @@ -1,6 +1,3 @@ -import argparse -import os - import mxnet as mx from tools.test_rcnn import test_rcnn @@ -9,4 +6,5 @@ if __name__ == '__main__': args = parse_args() ctx = mx.gpu(args.gpu_id) - test_rcnn(args.image_set, args.year, args.root_path, args.devkit_path, args.prefix, args.epoch, ctx, args.vis, args.has_rpn) + test_rcnn(args.image_set, args.year, args.root_path, args.devkit_path, args.prefix, args.epoch, ctx, args.vis, + args.has_rpn) diff --git a/example/rcnn/tools/test_rcnn.py b/example/rcnn/tools/test_rcnn.py index fdbc92c97acf..92b1b8600016 100644 --- a/example/rcnn/tools/test_rcnn.py +++ b/example/rcnn/tools/test_rcnn.py @@ -12,7 +12,8 @@ from utils.load_model import load_param -def test_rcnn(imageset, year, root_path, devkit_path, prefix, epoch, ctx, vis=False, has_rpn=True, proposal='rpn'): +def test_rcnn(imageset, year, root_path, devkit_path, prefix, epoch, ctx, vis=False, has_rpn=True, proposal='rpn', + end2end=False): # load symbol and testing data if has_rpn: sym = get_vgg_test() @@ -28,7 +29,7 @@ def test_rcnn(imageset, year, root_path, devkit_path, prefix, epoch, ctx, vis=Fa test_data = ROIIter(roidb, batch_size=1, shuffle=False, mode='test') # load model - args, auxs = load_param(prefix, epoch, convert=True, ctx=ctx) + args, auxs, _ = load_param(prefix, epoch, convert=True, ctx=ctx) # detect detector = Detector(sym, ctx, args, auxs) @@ -61,5 +62,7 @@ def parse_args(): if 
__name__ == '__main__': args = parse_args() ctx = mx.gpu(args.gpu_id) + if args.end2end: + args.has_rpn = True test_rcnn(args.image_set, args.year, args.root_path, args.devkit_path, args.prefix, args.epoch, ctx, args.vis, args.has_rpn, args.proposal) diff --git a/example/rcnn/train_end2end.py b/example/rcnn/train_end2end.py new file mode 100644 index 000000000000..09ac162d7ea7 --- /dev/null +++ b/example/rcnn/train_end2end.py @@ -0,0 +1,172 @@ +import argparse +import logging +import os +import mxnet as mx +from rcnn.callback import Speedometer +from rcnn.config import config +from rcnn.loader import AnchorLoader +from rcnn.metric import AccuracyMetric, LogLossMetric, SmoothL1LossMetric +from rcnn.module import MutableModule +from rcnn.symbol import get_faster_rcnn +from utils.load_data import load_gt_roidb +from utils.load_model import do_checkpoint, load_param +from rcnn.warmup import WarmupScheduler + +logger = logging.getLogger() +logger.setLevel(logging.INFO) + +def end2end_train(image_set, test_image_set, year, root_path, devkit_path, pretrained, epoch, prefix, + ctx, begin_epoch, num_epoch, frequent, kv_store, mom, wd, lr, num_classes, monitor, + work_load_list=None, resume=False, use_flip=True, factor_step=50000): + # set up logger + logger = logging.getLogger() + logger.setLevel(logging.INFO) + mon = None + config.TRAIN.BG_THRESH_HI = 0.5 # TODO(verify) + config.TRAIN.BG_THRESH_LO = 0.0 # TODO(verify) + config.TRAIN.RPN_MIN_SIZE = 16 + + logging.info('########## TRAIN FASTER-RCNN WITH APPROXIMATE JOINT END2END #############') + config.TRAIN.HAS_RPN = True + config.END2END = 1 + config.TRAIN.BBOX_NORMALIZATION_PRECOMPUTED = True + sym = get_faster_rcnn(num_classes=num_classes) + feat_sym = sym.get_internals()['rpn_cls_score_output'] + + # setup multi-gpu + config.TRAIN.IMS_PER_BATCH *= len(ctx) + config.TRAIN.BATCH_SIZE *= len(ctx) # no used here + + # infer max shape + max_data_shape = [('data', (config.TRAIN.IMS_PER_BATCH, 3, 1000, 1000))] + 
max_data_shape_dict = {k: v for k, v in max_data_shape} + _, feat_shape, _ = feat_sym.infer_shape(**max_data_shape_dict) + from rcnn.minibatch import assign_anchor + import numpy as np + label = assign_anchor(feat_shape[0], np.zeros((0, 5)), [[1000, 1000, 1.0]]) + max_label_shape = [('label', label['label'].shape), + ('bbox_target', label['bbox_target'].shape), + ('bbox_inside_weight', label['bbox_inside_weight'].shape), + ('bbox_outside_weight', label['bbox_outside_weight'].shape), + ('gt_boxes', (config.TRAIN.IMS_PER_BATCH, 5*100))] # assume at most 100 object in image + print 'providing maximum shape', max_data_shape, max_label_shape + + # load training data + voc, roidb = load_gt_roidb(image_set, year, root_path, devkit_path, flip=use_flip) + train_data = AnchorLoader(feat_sym, roidb, batch_size=config.TRAIN.IMS_PER_BATCH, shuffle=True, mode='train', + ctx=ctx, work_load_list=work_load_list) + # load pretrained + args, auxs, _ = load_param(pretrained, epoch, convert=True) + + # initialize params + if not resume: + del args['fc8_weight'] + del args['fc8_bias'] + input_shapes = {k: (1,)+ v[1::] for k, v in train_data.provide_data + train_data.provide_label} + arg_shape, _, _ = sym.infer_shape(**input_shapes) + arg_shape_dict = dict(zip(sym.list_arguments(), arg_shape)) + + args['rpn_conv_3x3_weight'] = mx.random.normal(0, 0.01, shape=arg_shape_dict['rpn_conv_3x3_weight']) + args['rpn_conv_3x3_bias'] = mx.nd.zeros(shape=arg_shape_dict['rpn_conv_3x3_bias']) + args['rpn_cls_score_weight'] = mx.random.normal(0, 0.01, shape=arg_shape_dict['rpn_cls_score_weight']) + args['rpn_cls_score_bias'] = mx.nd.zeros(shape=arg_shape_dict['rpn_cls_score_bias']) + args['rpn_bbox_pred_weight'] = mx.random.normal(0, 0.001, shape=arg_shape_dict['rpn_bbox_pred_weight']) # guarantee not likely explode with bbox_delta + args['rpn_bbox_pred_bias'] = mx.nd.zeros(shape=arg_shape_dict['rpn_bbox_pred_bias']) + args['cls_score_weight'] = mx.random.normal(0, 0.01, 
shape=arg_shape_dict['cls_score_weight']) + args['cls_score_bias'] = mx.nd.zeros(shape=arg_shape_dict['cls_score_bias']) + args['bbox_pred_weight'] = mx.random.normal(0, 0.01, shape=arg_shape_dict['bbox_pred_weight']) + args['bbox_pred_bias'] = mx.nd.zeros(shape=arg_shape_dict['bbox_pred_bias']) + + # prepare training + if config.TRAIN.FINETUNE: + fixed_param_prefix = ['conv1', 'conv2', 'conv3', 'conv4', 'conv5'] + else: + fixed_param_prefix = ['conv1', 'conv2'] + data_names = [k[0] for k in train_data.provide_data] + label_names = [k[0] for k in train_data.provide_label] + batch_end_callback = Speedometer(train_data.batch_size, frequent=frequent) + epoch_end_callback = do_checkpoint(prefix) + rpn_eval_metric = AccuracyMetric(use_ignore=True, ignore=-1, ex_rpn=True) + rpn_cls_metric = LogLossMetric(use_ignore=True, ignore=-1, ex_rpn=True) + rpn_bbox_metric = SmoothL1LossMetric(ex_rpn=True) + eval_metric = AccuracyMetric() + cls_metric = LogLossMetric() + bbox_metric = SmoothL1LossMetric() + eval_metrics = mx.metric.CompositeEvalMetric() + for child_metric in [rpn_eval_metric, rpn_cls_metric, rpn_bbox_metric, eval_metric, cls_metric, bbox_metric]: + eval_metrics.add(child_metric) + optimizer_params = {'momentum': mom, + 'wd': wd, + 'learning_rate': lr, + 'lr_scheduler': WarmupScheduler(factor_step, 0.1, warmup_lr=lr*0.1, warmup_step=200) if not resume \ + else mx.lr_scheduler.FactorScheduler(factor_step, 0.1), + 'clip_gradient': 1.0, + 'rescale_grad': 1.0 } + # 'rescale_grad': (1.0 / config.TRAIN.RPN_BATCH_SIZE)} + # train + mod = MutableModule(sym, data_names=data_names, label_names=label_names, + logger=logger, context=ctx, work_load_list=work_load_list, + max_data_shapes=max_data_shape, max_label_shapes=max_label_shape, + fixed_param_prefix=fixed_param_prefix) + if monitor: + def norm_stat(d): + return mx.nd.norm(d)/np.sqrt(d.size) + mon = mx.mon.Monitor(100, norm_stat) + + mod.fit(train_data, eval_metric=eval_metrics, epoch_end_callback=epoch_end_callback, + 
batch_end_callback=batch_end_callback, kvstore=kv_store, + optimizer='sgd', optimizer_params=optimizer_params, monitor=mon, + arg_params=args, aux_params=auxs, begin_epoch=begin_epoch, num_epoch=num_epoch) + + +def parse_args(): + parser = argparse.ArgumentParser(description='Train Faster R-CNN Network') + parser.add_argument('--image_set', dest='image_set', help='can be trainval or train', + default='trainval', type=str) + parser.add_argument('--num-classes', dest='num_classes', help='the class number of dataset', + default=21, type=int) + parser.add_argument('--test_image_set', dest='test_image_set', help='can be test or val', + default='test', type=str) + parser.add_argument('--year', dest='year', help='can be 2007, 2010, 2012', + default='2007', type=str) + parser.add_argument('--no-flip', action='store_true', default=False, + help='if true, then will flip the dataset') + parser.add_argument('--root_path', dest='root_path', help='output data folder', + default=os.path.join(os.getcwd(), 'data'), type=str) + parser.add_argument('--devkit_path', dest='devkit_path', help='VOCdevkit path', + default=os.path.join(os.getcwd(), 'data', 'VOCdevkit'), type=str) + parser.add_argument('--pretrained', dest='pretrained', help='pretrained model prefix', + default=os.path.join(os.getcwd(), 'model', 'vgg16'), type=str) + parser.add_argument('--load-epoch', dest='load_epoch', help='epoch of pretrained model', + default=0, type=int) + parser.add_argument('--prefix', dest='prefix', help='new model prefix', + default=os.path.join(os.getcwd(), 'model', 'faster-rcnn'), type=str) + parser.add_argument('--gpus', dest='gpu_ids', help='GPU device to train with', + default='0', type=str) + parser.add_argument('--num_epoch', dest='num_epoch', help='end epoch of faster rcnn end2end training', + default=7, type=int) + parser.add_argument('--frequent', dest='frequent', help='frequency of logging', + default=20, type=int) + parser.add_argument('--kv_store', dest='kv_store', help='the kv-store 
type', + default='device', type=str) + parser.add_argument('--work_load_list', dest='work_load_list', help='work load for different devices', + default=None, type=list) + parser.add_argument('--lr', type=float, default=0.001, help='initialization learning reate') + parser.add_argument('--mom', type=float, default=0.9, help='momentum for sgd') + parser.add_argument('--wd', type=float, default=0.0005, help='weight decay for sgd') + parser.add_argument('--resume', action='store_true', default=False, + help='if true, then will retrain the model from rcnn') + parser.add_argument('--factor-step',type=int, default=50000, help='the step used for lr factor') + parser.add_argument('--monitor', action='store_true', default=False, + help='if true, then will use monitor debug') + args = parser.parse_args() + logging.info(args) + return args + +if __name__ == '__main__': + args = parse_args() + ctx = [mx.gpu(int(i)) for i in args.gpu_ids.split(',')] + end2end_train(args.image_set, args.test_image_set, args.year, args.root_path, args.devkit_path, + args.pretrained, args.load_epoch, args.prefix, ctx, args.load_epoch, args.num_epoch, + args.frequent, args.kv_store, args.mom, args.wd, args.lr, args.num_classes, args.monitor, + args.work_load_list, args.resume, not args.no_flip, args.factor_step) diff --git a/example/rcnn/utils/load_model.py b/example/rcnn/utils/load_model.py index c767661232e7..cca058379bbf 100644 --- a/example/rcnn/utils/load_model.py +++ b/example/rcnn/utils/load_model.py @@ -1,5 +1,7 @@ import mxnet as mx - +from mxnet.model import save_checkpoint +from rcnn.config import config +import numpy as np def load_checkpoint(prefix, epoch): """ @@ -24,6 +26,33 @@ def load_checkpoint(prefix, epoch): return arg_params, aux_params +def do_checkpoint(prefix): + """Callback to checkpoint the model to prefix every epoch. 
+ + Parameters + ---------- + prefix : str + The file prefix to checkpoint to + + Returns + ------- + callback : function + The callback function that can be passed as iter_end_callback to fit. + """ + def _callback(iter_no, sym, arg, aux): + if config.TRAIN.BBOX_NORMALIZATION_PRECOMPUTED: + print "save model with mean/std" + num_classes = len(arg['bbox_pred_bias'].asnumpy()) / 4 + means = np.tile(np.array(config.TRAIN.BBOX_MEANS), (1, num_classes)) + stds = np.tile(np.array(config.TRAIN.BBOX_STDS), (1, num_classes)) + arg['bbox_pred_weight'] = (arg['bbox_pred_weight'].T * mx.nd.array(stds)).T + arg['bbox_pred_bias'] = arg['bbox_pred_bias'] * mx.nd.array(np.squeeze(stds)) + \ + mx.nd.array(np.squeeze(means)) + """The checkpoint function.""" + save_checkpoint(prefix, iter_no + 1, sym, arg, aux) + return _callback + + def convert_context(params, ctx): """ :param params: dict of str to NDArray @@ -46,9 +75,21 @@ def load_param(prefix, epoch, convert=False, ctx=None): :return: (arg_params, aux_params) """ arg_params, aux_params = load_checkpoint(prefix, epoch) + num_classes = 1000 + if "bbox_pred_bias" in arg_params.keys(): + num_classes = len(arg_params['bbox_pred_bias'].asnumpy()) / 4 + if config.TRAIN.BBOX_NORMALIZATION_PRECOMPUTED and "bbox_pred_bias" in arg_params.keys(): + print "lode model with mean/std" + means = np.tile(np.array(config.TRAIN.BBOX_MEANS_INV), (1, num_classes)) + stds = np.tile(np.array(config.TRAIN.BBOX_STDS_INV), (1, num_classes)) + arg_params['bbox_pred_weight'] = (arg_params['bbox_pred_weight'].T * mx.nd.array(stds)).T + arg_params['bbox_pred_bias'] = (arg_params['bbox_pred_bias'] - mx.nd.array(np.squeeze(means))) * \ + mx.nd.array(np.squeeze(stds)) + if convert: if ctx is None: ctx = mx.cpu() arg_params = convert_context(arg_params, ctx) aux_params = convert_context(aux_params, ctx) - return arg_params, aux_params + return arg_params, aux_params, num_classes + diff --git a/example/rnn/lstm_ptb.R b/example/rnn/lstm_ptb.R new file mode 100644 
index 000000000000..7c46b190ef9a --- /dev/null +++ b/example/rnn/lstm_ptb.R @@ -0,0 +1,106 @@ +# PennTreeBank Language Model using lstm, you can replace mx.lstm by mx.gru/ mx.rnn to use gru/rnn model +# The data file can be found at: +# https://github.com/dmlc/web-data/tree/master/mxnet/ptb +require(hash) +require(mxnet) +require(stringr + + ) + +load.data <- function(path, dic=NULL) { + fi <- file(path, "r") + content <- paste(readLines(fi), collapse="") + close(fi) + #cat(content) + content <- str_split(content, ' ')[[1]] + cat(paste0("Loading ", path, ", size of data = ", length(content), "\n")) + X <- array(0, dim=c(length(content))) + #cat(X) + if (is.null(dic)) + dic <- hash() + idx <- 1 + for (i in 1:length(content)) { + word <- content[i] + if (str_length(word) > 0) { + if (!has.key(word, dic)) { + dic[[word]] <- idx + idx <- idx + 1 + } + X[i] <- dic[[word]] + } + } + cat(paste0("Unique token: ", length(dic), "\n")) + return (list(X=X, dic=dic)) +} + + +replicate.data <- function(X, seq.len) { + num.seq <- as.integer(length(X) / seq.len) + X <- X[1:(num.seq*seq.len)] + print + dim(X) = c(seq.len, num.seq) + return (X) +} + +drop.tail <- function(X, batch.size) { + shape <- dim(X) + nstep <- as.integer(shape[2] / batch.size) + return (X[, 1:(nstep * batch.size)]) +} + +get.label <- function(X) { + label <- array(0, dim=dim(X)) + d <- dim(X)[1] + w <- dim(X)[2] + for (i in 0:(w-1)) { + for (j in 1:d) { + label[i*d+j] <- X[(i*d+j)%%(w*d)+1] + } + } + return (label) +} + +batch.size = 20 +seq.len = 35 +num.hidden = 200 +num.embed = 200 +num.lstm.layer = 2 +num.round = 15 +learning.rate= 0.1 +wd=0.00001 +update.period = 1 + + +train <- load.data("./data/ptb.train.txt") +X.train <- train$X +dic <- train$dic +val <- load.data("./data/ptb.valid.txt", dic) +X.val <- val$X +dic <- val$dic +X.train.data <- replicate.data(X.train, seq.len) +X.val.data <- replicate.data(X.val, seq.len) +vocab <- length(dic) +cat(paste0("Vocab=", vocab, "\n")) + +X.train.data <- 
drop.tail(X.train.data, batch.size) +X.val.data <- drop.tail(X.val.data, batch.size) +X.train.label <- get.label(X.train.data) +X.val.label <- get.label(X.val.data) +X.train <- list(data=X.train.data, label=X.train.label) +X.val <- list(data=X.val.data, label=X.val.label) + +model <- mx.lstm(X.train, X.val, + ctx=mx.gpu(0), + num.round=num.round, + update.period=update.period, + num.lstm.layer=num.lstm.layer, + seq.len=seq.len, + num.hidden=num.hidden, + num.embed=num.embed, + num.label=vocab, + batch.size=batch.size, + input.size=vocab, + initializer=mx.init.uniform(0.01), + learning.rate=learning.rate, + wd=wd) + diff --git a/example/rnn/rnn_cell_demo.py b/example/rnn/rnn_cell_demo.py new file mode 100644 index 000000000000..2c798e2c9c13 --- /dev/null +++ b/example/rnn/rnn_cell_demo.py @@ -0,0 +1,135 @@ +"""A simple demo of new RNN cell with PTB language model.""" + +import os + +import numpy as np +import mxnet as mx + +from bucket_io import BucketSentenceIter, default_build_vocab + + +data_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data')) + + +def Perplexity(label, pred): + # TODO(tofix): we make a transpose of label here, because when + # using the RNN cell, we called swap axis to the data. + label = label.T.reshape((-1,)) + loss = 0. 
+ for i in range(pred.shape[0]): + loss += -np.log(max(1e-10, pred[i][int(label[i])])) + return np.exp(loss / label.size) + + +if __name__ == '__main__': + batch_size = 128 + buckets = [10, 20, 30, 40, 50, 60] + num_hidden = 200 + num_embed = 200 + num_lstm_layer = 2 + + num_epoch = 2 + learning_rate = 0.01 + momentum = 0.0 + + contexts = [mx.context.gpu(i) for i in range(4)] + vocab = default_build_vocab(os.path.join(data_dir, 'ptb.train.txt')) + + init_h = [('LSTM_init_h', (batch_size, num_lstm_layer, num_hidden))] + init_c = [('LSTM_init_c', (batch_size, num_lstm_layer, num_hidden))] + init_states = init_c + init_h + + data_train = BucketSentenceIter(os.path.join(data_dir, 'ptb.train.txt'), + vocab, buckets, batch_size, init_states) + data_val = BucketSentenceIter(os.path.join(data_dir, 'ptb.valid.txt'), + vocab, buckets, batch_size, init_states) + + def sym_gen(seq_len): + data = mx.sym.Variable('data') + label = mx.sym.Variable('softmax_label') + embed = mx.sym.Embedding(data=data, input_dim=len(vocab), + output_dim=num_embed, name='embed') + + # TODO(tofix) + # The inputs and labels from IO are all in batch-major. + # We need to transform them into time-major to use RNN cells. + embed_tm = mx.sym.SwapAxis(embed, dim1=0, dim2=1) + label_tm = mx.sym.SwapAxis(label, dim1=0, dim2=1) + + # TODO(tofix) + # Create transformed RNN initial states. Normally we do + # no need to do this. But the RNN symbol expects the state + # to be time-major shape layout, while the current mxnet + # IO and high-level training logic assume everything from + # the data iter have batch_size as the first dimension. + # So until we have extended our IO and training logic to + # support this more general case, this dummy axis swap is + # needed. 
+ rnn_h_init = mx.sym.SwapAxis(mx.sym.Variable('LSTM_init_h'), + dim1=0, dim2=1) + rnn_c_init = mx.sym.SwapAxis(mx.sym.Variable('LSTM_init_c'), + dim1=0, dim2=1) + + # TODO(tofix) + # currently all the LSTM parameters are concatenated as + # a huge vector, and named '_parameters'. By default + # mxnet initializer does not know how to initilize this + # guy because its name does not ends with _weight or _bias + # or anything familiar. Here we just use a temp workaround + # to create a variable and name it as LSTM_bias to get + # this demo running. Note by default bias is initialized + # as zeros, so this is not a good scheme. But calling it + # LSTM_weight is not good, as this is 1D vector, while + # the initialization scheme of a weight parameter needs + # at least two dimensions. + rnn_params = mx.sym.Variable('LSTM_bias') + + # RNN cell takes input of shape (time, batch, feature) + rnn = mx.sym.RNN(data=embed_tm, state_size=num_hidden, + num_layers=num_lstm_layer, mode='lstm', + name='LSTM', + # The following params can be omitted + # provided we do not need to apply the + # workarounds mentioned above + state=rnn_h_init, + state_cell=rnn_c_init, + parameters=rnn_params) + + # the RNN cell output is of shape (time, batch, dim) + # if we need the states and cell states in the last time + # step (e.g. 
when building encoder-decoder models), we + # can set state_outputs=True, and the RNN cell will have + # extra outputs: rnn['LSTM_output'], rnn['LSTM_state'] + # and for LSTM, also rnn['LSTM_state_cell'] + + # now we collapse the time and batch dimension to do the + # final linear logistic regression prediction + hidden = mx.sym.Reshape(data=rnn, shape=(-1, num_hidden)) + label_cl = mx.sym.Reshape(data=label_tm, shape=(-1,)) + + pred = mx.sym.FullyConnected(data=hidden, num_hidden=len(vocab), + name='pred') + sm = mx.sym.SoftmaxOutput(data=pred, label=label_cl, name='softmax') + + data_names = ['data', 'LSTM_init_h', 'LSTM_init_c'] + label_names = ['softmax_label'] + + return (sm, data_names, label_names) + + if len(buckets) == 1: + mod = mx.mod.Module(*sym_gen(buckets[0]), context=contexts) + else: + mod = mx.mod.BucketingModule(sym_gen, default_bucket_key=data_train.default_bucket_key, + context=contexts) + + import logging + head = '%(asctime)-15s %(message)s' + logging.basicConfig(level=logging.DEBUG, format=head) + + mod.fit(data_train, eval_data=data_val, num_epoch=num_epoch, + eval_metric=mx.metric.np(Perplexity), + batch_end_callback=mx.callback.Speedometer(batch_size, 50), + initializer=mx.init.Xavier(factor_type="in", magnitude=2.34), + optimizer='sgd', + optimizer_params={'learning_rate': learning_rate, + 'momentum': momentum, 'wd': 0.00001}) diff --git a/example/rnn/rnn_model.py b/example/rnn/rnn_model.py index fc958fc8acc8..2135abd357c9 100644 --- a/example/rnn/rnn_model.py +++ b/example/rnn/rnn_model.py @@ -33,7 +33,7 @@ def __init__(self, data_shape = [("data", (batch_size,))] input_shapes = dict(init_c + init_h + data_shape) - self.executor = self.sym.simple_bind(ctx=mx.cpu(), **input_shapes) + self.executor = self.sym.simple_bind(ctx=ctx, **input_shapes) for key in self.executor.arg_dict.keys(): if key in arg_params: diff --git a/example/speech-demo/io_util.py b/example/speech-demo/io_util.py index 5ef8bf21e6f0..6def02dc836a 100644 --- 
a/example/speech-demo/io_util.py +++ b/example/speech-demo/io_util.py @@ -68,19 +68,19 @@ def __init__(self, train_sets, batch_size, self.data_name = data_name if has_label: self.label_name = label_name - + features = [] labels = [] utt_lens = [] utt_ids = [] buckets = [] self.has_label = has_label - + if label_mean_sets is not None: self.label_mean_sets.initialize_read() (feats, tgts, utt_id) = self.label_mean_sets.load_next_seq() - self.label_mean = feats/np.sum(feats) + self.label_mean = feats/np.sum(feats) for i,v in enumerate(feats): if v <= 1.0: self.label_mean[i] = 1 @@ -103,7 +103,7 @@ def __init__(self, train_sets, batch_size, labels.append(tgts+1) if feats.shape[0] not in buckets: buckets_map[feats.shape[0]] = feats.shape[0] - + for k, v in buckets_map.iteritems(): buckets.append(k) @@ -116,13 +116,13 @@ def __init__(self, train_sets, batch_size, self.utt_lens = [[] for k in buckets] self.feat_dim = feat_dim self.default_bucket_key = max(buckets) - + for i, feats in enumerate(features): if has_label: tgts = labels[i] utt_len = utt_lens[i] utt_id = utt_ids[i] - + for i, bkt in enumerate(buckets): if bkt >= utt_len: i_bucket = i @@ -620,3 +620,4 @@ def __iter__(self): def reset(self): self.bucket_curr_idx = [0 for x in self.data] + diff --git a/example/speech-demo/python_wrap/Makefile b/example/speech-demo/python_wrap/Makefile index 0c76e9e973a1..2c020b0d8757 100644 --- a/example/speech-demo/python_wrap/Makefile +++ b/example/speech-demo/python_wrap/Makefile @@ -8,6 +8,6 @@ OBJFILES = ctypes.o LIBNAME = kaldi-python-wrap -ADDLIBS = ../util/kaldi-util.a ../matrix/kaldi-matrix.a ../base/kaldi-base.a ../hmm/kaldi-hmm.a ../cudamatrix/kaldi-cudamatrix.a ../nnet/kaldi-nnet.a +ADDLIBS = ../util/kaldi-util.a ../matrix/kaldi-matrix.a ../base/kaldi-base.a ../hmm/kaldi-hmm.a ../cudamatrix/kaldi-cudamatrix.a ../nnet/kaldi-nnet.a ../thread/kaldi-thread.a include ../makefiles/default_rules.mk diff --git a/example/stochastic-depth/sd_cifar10.py 
b/example/stochastic-depth/sd_cifar10.py new file mode 100644 index 000000000000..995601d4af2c --- /dev/null +++ b/example/stochastic-depth/sd_cifar10.py @@ -0,0 +1,201 @@ +########################################################################################### +# Implementation of the stochastic depth algorithm described in the paper +# +# Huang, Gao, et al. "Deep networks with stochastic depth." arXiv preprint arXiv:1603.09382 (2016). +# +# Reference torch implementation can be found at https://github.com/yueatsprograms/Stochastic_Depth +# +# There are some differences in the implementation: +# - A BN->ReLU->Conv is used for skip connection when input and output shapes are different, +# as oppose to a padding layer. +# - The residual block is different: we use BN->ReLU->Conv->BN->ReLU->Conv, as oppose to +# Conv->BN->ReLU->Conv->BN (->ReLU also applied to skip connection). +# - We did not try to match with the same initialization, learning rate scheduling, etc. +# +#-------------------------------------------------------------------------------- +# A sample from the running log (We achieved ~9.4% error after 500 epochs, some +# more careful tuning of the hyper parameters and maybe also the arch is needed +# to achieve the reported numbers in the paper): +# +# INFO:root:Epoch[80] Batch [50] Speed: 1020.95 samples/sec Train-accuracy=0.910080 +# INFO:root:Epoch[80] Batch [100] Speed: 1013.41 samples/sec Train-accuracy=0.912031 +# INFO:root:Epoch[80] Batch [150] Speed: 1035.48 samples/sec Train-accuracy=0.913438 +# INFO:root:Epoch[80] Batch [200] Speed: 1045.00 samples/sec Train-accuracy=0.907344 +# INFO:root:Epoch[80] Batch [250] Speed: 1055.32 samples/sec Train-accuracy=0.905937 +# INFO:root:Epoch[80] Batch [300] Speed: 1071.71 samples/sec Train-accuracy=0.912500 +# INFO:root:Epoch[80] Batch [350] Speed: 1033.73 samples/sec Train-accuracy=0.910937 +# INFO:root:Epoch[80] Train-accuracy=0.919922 +# INFO:root:Epoch[80] Time cost=48.348 +# INFO:root:Saved checkpoint 
to "sd-110-0081.params" +# INFO:root:Epoch[80] Validation-accuracy=0.880142 +# ... +# INFO:root:Epoch[115] Batch [50] Speed: 1037.04 samples/sec Train-accuracy=0.937040 +# INFO:root:Epoch[115] Batch [100] Speed: 1041.12 samples/sec Train-accuracy=0.934219 +# INFO:root:Epoch[115] Batch [150] Speed: 1036.02 samples/sec Train-accuracy=0.933125 +# INFO:root:Epoch[115] Batch [200] Speed: 1057.49 samples/sec Train-accuracy=0.938125 +# INFO:root:Epoch[115] Batch [250] Speed: 1060.56 samples/sec Train-accuracy=0.933438 +# INFO:root:Epoch[115] Batch [300] Speed: 1046.25 samples/sec Train-accuracy=0.935625 +# INFO:root:Epoch[115] Batch [350] Speed: 1043.83 samples/sec Train-accuracy=0.927188 +# INFO:root:Epoch[115] Train-accuracy=0.938477 +# INFO:root:Epoch[115] Time cost=47.815 +# INFO:root:Saved checkpoint to "sd-110-0116.params" +# INFO:root:Epoch[115] Validation-accuracy=0.884415 +# ... +# INFO:root:Saved checkpoint to "sd-110-0499.params" +# INFO:root:Epoch[498] Validation-accuracy=0.908554 +# INFO:root:Epoch[499] Batch [50] Speed: 1068.28 samples/sec Train-accuracy=0.991422 +# INFO:root:Epoch[499] Batch [100] Speed: 1053.10 samples/sec Train-accuracy=0.991094 +# INFO:root:Epoch[499] Batch [150] Speed: 1042.89 samples/sec Train-accuracy=0.995156 +# INFO:root:Epoch[499] Batch [200] Speed: 1066.22 samples/sec Train-accuracy=0.991406 +# INFO:root:Epoch[499] Batch [250] Speed: 1050.56 samples/sec Train-accuracy=0.990781 +# INFO:root:Epoch[499] Batch [300] Speed: 1032.02 samples/sec Train-accuracy=0.992500 +# INFO:root:Epoch[499] Batch [350] Speed: 1062.16 samples/sec Train-accuracy=0.992969 +# INFO:root:Epoch[499] Train-accuracy=0.994141 +# INFO:root:Epoch[499] Time cost=47.401 +# INFO:root:Saved checkpoint to "sd-110-0500.params" +# INFO:root:Epoch[499] Validation-accuracy=0.906050 +# ########################################################################################### + +import os +import sys +import mxnet as mx +import logging + +import sd_module + 
+sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "image-classification"))) +from train_cifar10 import get_iterator + + +def residual_module(death_rate, n_channel, name_scope, context, stride=1, bn_momentum=0.9): + data = mx.sym.Variable(name_scope + '_data') + + # computation branch: + # BN -> ReLU -> Conv -> BN -> ReLU -> Conv + bn1 = mx.symbol.BatchNorm(data=data, name=name_scope + '_bn1', fix_gamma=False, + momentum=bn_momentum, + # Same with https://github.com/soumith/cudnn.torch/blob/master/BatchNormalization.lua + # cuDNN v5 don't allow a small eps of 1e-5 + eps=2e-5 + ) + relu1 = mx.symbol.Activation(data=bn1, act_type='relu', name=name_scope+'_relu1') + conv1 = mx.symbol.Convolution(data=relu1, num_filter=n_channel, kernel=(3, 3), pad=(1,1), + stride=(stride, stride), name=name_scope+'_conv1') + bn2 = mx.symbol.BatchNorm(data=conv1, fix_gamma=False, momentum=bn_momentum, + eps=2e-5, name=name_scope+'_bn2') + relu2 = mx.symbol.Activation(data=bn2, act_type='relu', name=name_scope+'_relu2') + conv2 = mx.symbol.Convolution(data=relu2, num_filter=n_channel, kernel=(3, 3), pad=(1,1), + stride=(1, 1), name=name_scope+'_conv2') + sym_compute = conv2 + + # skip branch + if stride > 1: + sym_skip = mx.symbol.BatchNorm(data=data, fix_gamma=False, momentum=bn_momentum, + eps=2e-5, name=name_scope+'_skip_bn') + sym_skip = mx.symbol.Activation(data=sym_skip, act_type='relu', name=name_scope+'_skip_relu') + sym_skip = mx.symbol.Convolution(data=sym_skip, num_filter=n_channel, kernel=(3, 3), pad=(1, 1), + stride=(stride, stride), name=name_scope+'_skip_conv') + else: + sym_skip = None + + mod = sd_module.StochasticDepthModule(sym_compute, sym_skip, data_names=[name_scope+'_data'], + context=context, death_rate=death_rate) + return mod + + +################################################################################# +# Build architecture +# Configurations +bn_momentum = 0.9 +contexts = [mx.context.gpu(i) for i in range(1)] 
+n_residual_blocks = 18 +death_rate = 0.5 +death_mode = 'linear_decay' # 'linear_decay' or 'uniform' + +n_classes = 10 + +def get_death_rate(i_res_block): + n_total_res_blocks = n_residual_blocks * 3 + if death_mode == 'linear_decay': + my_death_rate = float(i_res_block) / n_total_res_blocks * death_rate + else: + my_death_rate = death_rate + return my_death_rate + +# 0. base ConvNet +sym_base = mx.sym.Variable('data') +sym_base = mx.sym.Convolution(data=sym_base, num_filter=16, kernel=(3, 3), pad=(1, 1), name='conv1') +sym_base = mx.sym.BatchNorm(data=sym_base, name='bn1', fix_gamma=False, momentum=bn_momentum, eps=2e-5) +sym_base = mx.sym.Activation(data=sym_base, name='relu1', act_type='relu') +mod_base = mx.mod.Module(sym_base, context=contexts, label_names=None) + +# 1. container +mod_seq = mx.mod.SequentialModule() +mod_seq.add(mod_base) + +# 2. first group, 16 x 28 x 28 +i_res_block = 0 +for i in range(n_residual_blocks): + mod_seq.add(residual_module(get_death_rate(i_res_block), 16, 'res_A_%d' % i, contexts), auto_wiring=True) + i_res_block += 1 + +# 3. second group, 32 x 14 x 14 +mod_seq.add(residual_module(get_death_rate(i_res_block), 32, 'res_AB', contexts, stride=2), auto_wiring=True) +i_res_block += 1 + +for i in range(n_residual_blocks-1): + mod_seq.add(residual_module(get_death_rate(i_res_block), 32, 'res_B_%d' % i, contexts), auto_wiring=True) + i_res_block += 1 + +# 4. third group, 64 x 7 x 7 +mod_seq.add(residual_module(get_death_rate(i_res_block), 64, 'res_BC', contexts, stride=2), auto_wiring=True) +i_res_block += 1 + +for i in range(n_residual_blocks-1): + mod_seq.add(residual_module(get_death_rate(i_res_block), 64, 'res_C_%d' % i, contexts), auto_wiring=True) + i_res_block += 1 + +# 5. 
final module +sym_final = mx.sym.Variable('data') +sym_final = mx.sym.Pooling(data=sym_final, kernel=(7, 7), pool_type='avg', name='global_pool') +sym_final = mx.sym.FullyConnected(data=sym_final, num_hidden=n_classes, name='logits') +sym_final = mx.sym.SoftmaxOutput(data=sym_final, name='softmax') +mod_final = mx.mod.Module(sym_final, context=contexts) +mod_seq.add(mod_final, auto_wiring=True, take_labels=True) + + +################################################################################# +# Training +num_examples = 60000 +batch_size = 128 +base_lr = 0.008 +lr_factor = 0.5 +lr_factor_epoch = 100 +momentum = 0.9 +weight_decay = 0.00001 +kv_store = 'local' + +initializer = mx.init.Xavier(factor_type="in", magnitude=2.34) +num_epochs = 500 + +epoch_size = num_examples / batch_size +lr_scheduler = mx.lr_scheduler.FactorScheduler(step=max(int(epoch_size * lr_factor_epoch), 1), factor=lr_factor) + +batch_end_callbacks = [mx.callback.Speedometer(batch_size, 50)] +epoch_end_callbacks = [mx.callback.do_checkpoint('sd-%d' % (n_residual_blocks * 6 + 2))] + + +args = type('', (), {})() +args.batch_size = batch_size +args.data_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "image-classification", "cifar10")) + '/' +kv = mx.kvstore.create(kv_store) +train, val = get_iterator(args, kv) + +logging.basicConfig(level=logging.DEBUG) +mod_seq.fit(train, val, + optimizer_params={'learning_rate': base_lr, 'momentum': momentum, + 'lr_scheduler': lr_scheduler, 'wd': weight_decay}, + num_epoch=num_epochs, batch_end_callback=batch_end_callbacks, + epoch_end_callback=epoch_end_callbacks, + initializer=initializer) + diff --git a/example/stochastic-depth/sd_mnist.py b/example/stochastic-depth/sd_mnist.py new file mode 100644 index 000000000000..66529a273de1 --- /dev/null +++ b/example/stochastic-depth/sd_mnist.py @@ -0,0 +1,86 @@ +################################################################################ +# A sanity check mainly for debugging purpose. 
See sd_cifar10.py for a non-trivial +# example of stochastic depth on cifar10. +################################################################################ + +import os +import sys +import mxnet as mx +import logging + +import sd_module + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "image-classification"))) +from train_mnist import get_iterator +from symbol_resnet import get_conv + +death_rates = [0.3] +contexts = [mx.context.cpu()] + +data = mx.symbol.Variable('data') +conv = get_conv( + name='conv0', + data=data, + num_filter=16, + kernel=(3, 3), + stride=(1, 1), + pad=(1, 1), + with_relu=True, + bn_momentum=0.9 +) + +base_mod = mx.mod.Module(conv, label_names=None, context=contexts) +mod_seq = mx.mod.SequentialModule() +mod_seq.add(base_mod) + +for i in range(len(death_rates)): + conv = get_conv( + name='conv0_%d' % i, + data=mx.sym.Variable('data_%d' % i), + num_filter=16, + kernel=(3, 3), + stride=(1, 1), + pad=(1, 1), + with_relu=True, + bn_momentum=0.9 + ) + conv = get_conv( + name='conv1_%d' % i, + data=conv, + num_filter=16, + kernel=(3, 3), + stride=(1, 1), + pad=(1, 1), + with_relu=False, + bn_momentum=0.9 + ) + mod = sd_module.StochasticDepthModule(conv, data_names=['data_%d' % i], + context=contexts, death_rate=death_rates[i]) + mod_seq.add(mod, auto_wiring=True) + +act = mx.sym.Activation(mx.sym.Variable('data_final'), act_type='relu') +flat = mx.sym.Flatten(act) +pred = mx.sym.FullyConnected(flat, num_hidden=10) +softmax = mx.sym.SoftmaxOutput(pred, name='softmax') +mod_seq.add(mx.mod.Module(softmax, context=contexts, data_names=['data_final']), + auto_wiring=True, take_labels=True) + + +n_epoch = 2 +batch_size = 100 + + +train = mx.io.MNISTIter( + image="../image-classification/mnist/train-images-idx3-ubyte", + label="../image-classification/mnist/train-labels-idx1-ubyte", + input_shape=(1, 28, 28), flat=False, + batch_size=batch_size, shuffle=True, silent=False, seed=10) +val = mx.io.MNISTIter( + 
image="../image-classification/mnist/t10k-images-idx3-ubyte", + label="../image-classification/mnist/t10k-labels-idx1-ubyte", + input_shape=(1, 28, 28), flat=False, + batch_size=batch_size, shuffle=True, silent=False) + +logging.basicConfig(level=logging.DEBUG) +mod_seq.fit(train, val, optimizer_params={'learning_rate': 0.01, 'momentum': 0.9}, + num_epoch=n_epoch, batch_end_callback=mx.callback.Speedometer(batch_size, 10)) diff --git a/example/stochastic-depth/sd_module.py b/example/stochastic-depth/sd_module.py new file mode 100644 index 000000000000..ae8cfe0ba255 --- /dev/null +++ b/example/stochastic-depth/sd_module.py @@ -0,0 +1,172 @@ +import logging +import mxnet as mx +import numpy as np + + +class RandomNumberQueue(object): + def __init__(self, pool_size=1000): + self._pool = np.random.rand(pool_size) + self._index = 0 + + def get_sample(self): + if self._index >= len(self._pool): + self._pool = np.random.rand(len(self._pool)) + self._index = 0 + self._index += 1 + return self._pool[self._index-1] + + +class StochasticDepthModule(mx.module.BaseModule): + """Stochastic depth module is a two branch computation: one is actual computing and the + other is the skip computing (usually an identity map). This is similar to a Residual block, + except that a random variable is used to randomly turn off the computing branch, in order + to save computation during training. + + Parameters + ---------- + symbol_compute: Symbol + The computation branch. + symbol_skip: Symbol + The skip branch. Could be None, in which case an identity map will be automatically + used. Note the two branch should produce exactly the same output shapes. + data_names: list of str + Default is `['data']`. Indicating the input names. Note if `symbol_skip` is not None, + it should have the same input names as `symbol_compute`. + label_names: list of str + Default is None, indicating that this module does not take labels. + death_rate: float + Default 0. 
The probability of turning off the computing branch. + """ + def __init__(self, symbol_compute, symbol_skip=None, + data_names=('data',), label_names=None, + logger=logging, context=mx.context.cpu(), + work_load_list=None, fixed_param_names=None, + death_rate=0): + super(StochasticDepthModule, self).__init__(logger=logger) + + self._module_compute = mx.module.Module( + symbol_compute, data_names=data_names, + label_names=label_names, logger=logger, + context=context, work_load_list=work_load_list, + fixed_param_names=fixed_param_names) + + if symbol_skip is not None: + self._module_skip = mx.module.Module( + symbol_skip, data_names=data_names, + label_names=label_names, logger=logger, + context=context, work_load_list=work_load_list, + fixed_param_names=fixed_param_names) + else: + self._module_skip = None + + self._open_rate = 1 - death_rate + self._gate_open = True + self._outputs = None + self._input_grads = None + self._rnd_queue = RandomNumberQueue() + + @property + def data_names(self): + return self._module_compute.data_names + + @property + def output_names(self): + return self._module_compute.output_names + + @property + def data_shapes(self): + return self._module_compute.data_shapes + + @property + def label_shapes(self): + return self._module_compute.label_shapes + + @property + def output_shapes(self): + return self._module_compute.output_shapes + + def get_params(self): + params = self._module_compute.get_params() + if self._module_skip: + params = [x.copy() for x in params] + skip_params = self._module_skip.get_params() + for a, b in zip(params, skip_params): + # make sure they do not contain duplicated param names + assert len(set(a.keys()) & set(b.keys())) == 0 + a.update(b) + return params + + def init_params(self, *args, **kwargs): + self._module_compute.init_params(*args, **kwargs) + if self._module_skip: + self._module_skip.init_params(*args, **kwargs) + + def bind(self, *args, **kwargs): + self._module_compute.bind(*args, **kwargs) + if 
self._module_skip: + self._module_skip.bind(*args, **kwargs) + + def init_optimizer(self, *args, **kwargs): + self._module_compute.init_optimizer(*args, **kwargs) + if self._module_skip: + self._module_skip.init_optimizer(*args, **kwargs) + + def borrow_optimizer(self, shared_module): + self._module_compute.borrow_optimizer(shared_module._module_compute) + if self._module_skip: + self._module_skip.borrow_optimizer(shared_module._module_skip) + + def forward(self, data_batch, is_train=None): + if is_train is None: + is_train = self._module_compute.for_training + + if self._module_skip: + self._module_skip.forward(data_batch, is_train=True) + self._outputs = self._module_skip.get_outputs() + else: + self._outputs = data_batch.data + + if is_train: + self._gate_open = self._rnd_queue.get_sample() < self._open_rate + if self._gate_open: + self._module_compute.forward(data_batch, is_train=True) + computed_outputs = self._module_compute.get_outputs() + for i in range(len(self._outputs)): + self._outputs[i] += computed_outputs[i] + + else: # do expectation for prediction + self._module_compute.forward(data_batch, is_train=False) + computed_outputs = self._module_compute.get_outputs() + for i in range(len(self._outputs)): + self._outputs[i] += self._open_rate * computed_outputs[i] + + def backward(self, out_grads=None): + if self._module_skip: + self._module_skip.backward(out_grads=out_grads) + self._input_grads = self._module_skip.get_input_grads() + else: + self._input_grads = out_grads + + if self._gate_open: + self._module_compute.backward(out_grads=out_grads) + computed_input_grads = self._module_compute.get_input_grads() + for i in range(len(self._input_grads)): + self._input_grads[i] += computed_input_grads[i] + + def update(self): + self._module_compute.update() + if self._module_skip: + self._module_skip.update() + + def update_metric(self, eval_metric, labels): + self._module_compute.update_metric(eval_metric, labels) + if self._module_skip: + 
self._module_skip.update_metric(eval_metric, labels) + + def get_outputs(self, merge_multi_context=True): + assert merge_multi_context, "Force merging for now" + return self._outputs + + def get_input_grads(self, merge_multi_context=True): + assert merge_multi_context, "Force merging for now" + return self._input_grads diff --git a/include/mxnet/base.h b/include/mxnet/base.h index 52100cdf05ea..96e23a46ed70 100644 --- a/include/mxnet/base.h +++ b/include/mxnet/base.h @@ -81,11 +81,6 @@ typedef mshadow::index_t index_t; /*! \brief data type that will be used to store ndarray */ typedef mshadow::default_real_t real_t; -/*! \brief dynamic shape type */ -typedef mshadow::TShape TShape; -/*! \brief storage container type */ -typedef mshadow::TBlob TBlob; - /*! \brief Context information about the execution enviroment */ struct Context { /*! \brief Type of device */ @@ -155,23 +150,23 @@ struct Context { /*! * \brief Create a new context. * \param dev_type device type. - * \param dev_id device id. + * \param dev_id device id. -1 for current device. */ - inline static Context Create(DeviceType dev_type, int32_t dev_id); + inline static Context Create(DeviceType dev_type, int32_t dev_id = -1); /*! \return CPU Context */ inline static Context CPU(); /*! * Create a GPU context. * \param dev_id the device id. - * \return GPU Context. + * \return GPU Context. -1 for current GPU. */ - inline static Context GPU(int32_t dev_id); + inline static Context GPU(int32_t dev_id = -1); /*! * Create a pinned CPU context. * \param dev_id the device id for corresponding GPU. - * \return Pinned CPU context. + * \return Pinned CPU context. -1 for current GPU. */ - inline static Context CPUPinned(int32_t dev_id); + inline static Context CPUPinned(int32_t dev_id = -1); }; /*! 
@@ -208,7 +203,16 @@ inline bool Context::operator<(const Context &b) const { inline Context Context::Create(DeviceType dev_type, int32_t dev_id) { Context ctx; ctx.dev_type = dev_type; - ctx.dev_id = dev_id; + if (dev_id < 0) { + ctx.dev_id = 0; +#if MXNET_USE_CUDA + if (dev_type != kCPU) { + CHECK_EQ(cudaGetDevice(&ctx.dev_id), cudaSuccess); + } +#endif + } else { + ctx.dev_id = dev_id; + } return ctx; } inline Context Context::CPU() { @@ -224,55 +228,6 @@ inline Context Context::GPU(int32_t dev_id) { } } // namespace mxnet -namespace dmlc { -// Add a few patches to support TShape in dmlc/parameter. -DMLC_DECLARE_TYPE_NAME(mxnet::TShape, "Shape(tuple)"); - -namespace parameter { -template<> -class FieldEntry - : public FieldEntryBase, mxnet::TShape> { - public: - FieldEntry() : enforce_nonzero_(false), expect_ndim_(0) {} - // parent class - typedef FieldEntryBase, mxnet::TShape> Parent; - - virtual void Check(void *head) const { - Parent::Check(head); - mxnet::TShape &v = this->Get(head); - if (expect_ndim_ != 0 && v.ndim() != expect_ndim_) { - std::ostringstream os; - os << "value " << v << "for Parameter " << this->key_ - << " has wrong dimensions, expected dimension=" << expect_ndim_; - throw dmlc::ParamError(os.str()); - } - if (enforce_nonzero_) { - for (mxnet::index_t i = 0; i < v.ndim(); ++i) { - if (v[i] == 0U) { - std::ostringstream os; - os << "value " << v << "for Parameter " << this->key_ - << " is invalid, the input shape must be nonzero in all dimensions"; - throw dmlc::ParamError(os.str()); - } - } - } - } - inline FieldEntry &enforce_nonzero() { - this->enforce_nonzero_ = true; - return this->self(); - } - inline FieldEntry &set_expect_ndim(mshadow::index_t ndim) { - expect_ndim_ = ndim; - return this->self(); - } - - private: - // whether all the entries need to be nonzero - bool enforce_nonzero_; - // expected number of dimension, default = 0 means no restriction. 
- mxnet::index_t expect_ndim_; -}; -} // namespace parameter -} // namespace dmlc +#include "./tensor_blob.h" //! \endcond #endif // MXNET_BASE_H_ diff --git a/include/mxnet/c_api.h b/include/mxnet/c_api.h index 28bc89406c0b..b1a2622d44a8 100644 --- a/include/mxnet/c_api.h +++ b/include/mxnet/c_api.h @@ -310,7 +310,7 @@ MXNET_DLL int MXNDArrayWaitAll(); MXNET_DLL int MXNDArrayFree(NDArrayHandle handle); /*! * \brief Slice the NDArray along axis 0. - * \param handle the handle to the narraya + * \param handle the handle to the NDArray * \param slice_begin The beginning index of slice * \param slice_end The ending index of slice * \param out The NDArrayHandle of sliced NDArray @@ -322,9 +322,9 @@ MXNET_DLL int MXNDArraySlice(NDArrayHandle handle, NDArrayHandle *out); /*! * \brief Index the NDArray along axis 0. - * \param handle the handle to the narraya + * \param handle the handle to the NDArray * \param idx the index - * \param out The NDArrayHandle of sliced NDArray + * \param out The NDArrayHandle of output NDArray * \return 0 when success, -1 when failure happens */ MXNET_DLL int MXNDArrayAt(NDArrayHandle handle, @@ -1103,6 +1103,8 @@ MXNET_DLL int MXDataIterGetLabel(DataIterHandle handle, MXNET_DLL int MXInitPSEnv(mx_uint num_vars, const char **keys, const char **vals); + + /*! 
* \brief Create a kvstore * \param type the type of KVStore @@ -1245,6 +1247,16 @@ MXNET_DLL int MXKVStoreIsSchedulerNode(int *ret); */ MXNET_DLL int MXKVStoreBarrier(KVStoreHandle handle); +/** + * \brief whether to do barrier when finalize + * + * \param handle handle to the KVStore + * \param barrier_before_exit whether to do barrier when kvstore finalize + * \return 0 when success, -1 when failure happens + */ +MXNET_DLL int MXKVStoreSetBarrierBeforeExit(KVStoreHandle handle, + const int barrier_before_exit); + /** * \brief the prototype of a server controller * \param head the head of the command @@ -1279,6 +1291,21 @@ MXNET_DLL int MXKVStoreSendCommmandToServers(KVStoreHandle handle, int cmd_id, const char* cmd_body); +/** + * \brief Get the number of ps dead node(s) specified by {node_id} + * + * \param handle handle to the KVStore + * \param node_id Can be a node group or a single node. + * kScheduler = 1, kServerGroup = 2, kWorkerGroup = 4 + * \param number Ouptut number of dead nodes + * \param timeout_sec A node fails to send heartbeart in {timeout_sec} seconds + * will be presumed as 'dead' + */ +MXNET_DLL int MXKVStoreGetNumDeadNode(KVStoreHandle handle, + const int node_id, + int *number, + const int timeout_sec = 60); + /** * \brief Create a RecordIO writer object * \param uri path to file @@ -1304,6 +1331,14 @@ MXNET_DLL int MXRecordIOWriterFree(RecordIOHandle handle); MXNET_DLL int MXRecordIOWriterWriteRecord(RecordIOHandle *handle, const char *buf, size_t size); +/** + * \brief Get the current writer pointer position + * \param handle handle to RecordIO object + * \param pos handle to output position + * \return 0 when success, -1 when failure happens +*/ +MXNET_DLL int MXRecordIOWriterTell(RecordIOHandle *handle, size_t *pos); + /** * \brief Create a RecordIO reader object * \param uri path to file @@ -1329,6 +1364,14 @@ MXNET_DLL int MXRecordIOReaderFree(RecordIOHandle *handle); MXNET_DLL int MXRecordIOReaderReadRecord(RecordIOHandle *handle, 
char const **buf, size_t *size); +/** + * \brief Set the current reader pointer position + * \param handle handle to RecordIO object + * \param pos target position + * \return 0 when success, -1 when failure happens +*/ +MXNET_DLL int MXRecordIOReaderSeek(RecordIOHandle *handle, size_t pos); + /** * \brief Create a MXRtc object */ diff --git a/include/mxnet/kvstore.h b/include/mxnet/kvstore.h index e16bfcb74d0b..dafaf1bf9cab 100644 --- a/include/mxnet/kvstore.h +++ b/include/mxnet/kvstore.h @@ -10,6 +10,7 @@ #include #include #include +#include #include "./ndarray.h" #if MXNET_USE_DIST_KVSTORE #include "ps/ps.h" @@ -190,6 +191,14 @@ class KVStore { #endif // MXNET_USE_DIST_KVSTORE } + void set_barrier_before_exit(const bool barrier_before_exit) { +#if MXNET_USE_DIST_KVSTORE + if (!IsWorkerNode()) LOG(FATAL) << "barrier_before_exit takes effect only on worker nodes"; + barrier_before_exit_ = barrier_before_exit; +#else + LOG(FATAL) << "compile with USE_DIST_KVSTORE=1 to enable barrier"; +#endif + } /** * \return whether or not this process is a scheduler node. @@ -222,6 +231,18 @@ class KVStore { return 1; } + /*! + * \return the number of dead node(s) specified by {node_id} + * \param node_id can be a node group or a single node + * \param timeout a node fails to send heartbeart in {timeout} seconds + * will be presumed as 'dead' + * + * Always return 0 when type == "local" + */ + virtual int get_num_dead_node(int node_id, int timeout = 60) const { + return 0; + } + /*! * \brief global barrier among all worker machines * @@ -274,6 +295,11 @@ class KVStore { * \brief the kvstore type */ std::string type_; + + /** + * \brief whether to do barrier when finalize + */ + std::atomic barrier_before_exit_{true}; }; } // namespace mxnet diff --git a/include/mxnet/ndarray.h b/include/mxnet/ndarray.h index e4f15082b398..c1c73faecba2 100644 --- a/include/mxnet/ndarray.h +++ b/include/mxnet/ndarray.h @@ -72,6 +72,19 @@ class NDArray { }); return res; } + /*! 
+ * \return a chunk of raw data in TBlob + */ + inline TBlob raw_data(index_t offset, index_t length) const { + TBlob res; + TShape raw_shape(1); + raw_shape[0] = length; + MSHADOW_TYPE_SWITCH(dtype_, DType, { + res = TBlob(static_cast(ptr_->shandle.dptr) + + offset_ + offset, raw_shape, ptr_->shandle.ctx.dev_mask()); + }); + return res; + } /*! * \return the context of NDArray, this function is only valid when the NDArray is not empty */ @@ -368,6 +381,18 @@ class NDArray { */ void CopyFromTo(const NDArray &from, NDArray *to, int priority = 0); +/*! + * \brief copy a slice along any axis. + * \param from the NDArray we want to slice from + * \param slice_dim the axis we want to perform slice in + * \param start the beginning of the slice + * \param end the ending of the slice + * \param to the pre-allocated NDArray to copy the slice to + * \param priority the priority of the task + */ +void CopySliceTo(const NDArray &from, int slice_dim, index_t start, index_t end, + NDArray *to, int priority = 0); + /*! * \brief Perform elementwise sum over each data from source, store result into out. * \param source the ndarray we want to sum diff --git a/include/mxnet/operator_util.h b/include/mxnet/operator_util.h index 71276a4bec5f..d1b67cbb7db6 100644 --- a/include/mxnet/operator_util.h +++ b/include/mxnet/operator_util.h @@ -4,7 +4,7 @@ * \brief Utility functions and registries to help quickly build new operators. * * Use the register functions in this file when possible to simplify operator creations. - * Operators registred in this file will be exposed to both NDArray API and symbolic API. + * Operators registered in this file will be exposed to both NDArray API and symbolic API. * * \author Tianqi Chen */ diff --git a/include/mxnet/resource.h b/include/mxnet/resource.h index da41cb07e52d..93b8352b2617 100644 --- a/include/mxnet/resource.h +++ b/include/mxnet/resource.h @@ -76,7 +76,9 @@ struct Resource { * * This space can be shared with other calls to this->get_space. 
* So the caller need to serialize the calls when using the conflicted space. - * The temp space will remain valid until release is called. + * The old space can get freed, however, this will incur a synchronization, + * when running on device, so the launched kernels that depend on the temp space + * can finish correctly. * * \param shape the Shape of returning tensor. * \param stream the stream of retruning tensor. @@ -136,16 +138,6 @@ struct Resource { reinterpret_cast(get_host_space_internal(shape.Size() * sizeof(DType))), shape, shape[ndim - 1], NULL); } - /*! - * \brief Release the all existing allocated space. - * The existing allocated address will remain valdd - * until release is called. - * - * Even if user do not call release, the space occupation - * of the resource will remain at most two times of maximum - * requested space. - */ - void release() const; /*! * \brief internal function to get space from resources. * \param size The size of the space. diff --git a/include/mxnet/storage.h b/include/mxnet/storage.h index 60bca03b0680..4106fa688601 100644 --- a/include/mxnet/storage.h +++ b/include/mxnet/storage.h @@ -14,7 +14,7 @@ namespace mxnet { /*! * \brief Storage manager across multiple devices. */ -class MXNET_API Storage { +class Storage { public: /*! * \brief Storage handle. @@ -45,6 +45,16 @@ class MXNET_API Storage { * \param handle Handle struect. */ virtual void Free(Handle handle) = 0; + /*! + * \brief Free storage directly, without putting it into memory pool. + * This can synchronization of all previous runned device functions. + * + * This function is suitable for conatiner structure with requirement on upsizing + * in the beginning phase of the iteration. + * + * \param handle Handle struct. + */ + virtual void DirectFree(Handle handle) = 0; /*! * \brief Destructor. 
*/ diff --git a/include/mxnet/tensor_blob.h b/include/mxnet/tensor_blob.h new file mode 100644 index 000000000000..10ed74b3130c --- /dev/null +++ b/include/mxnet/tensor_blob.h @@ -0,0 +1,804 @@ +/*! + * Copyright (c) 2014 by Contributors + * \file tensor_blob.h + * \brief TBlob class that holds common representation of + * arbirary dimension tensor, can be used to transformed + * to normal fixed dimenson tensor + * \author Tianqi Chen + */ +#ifndef MXNET_TENSOR_BLOB_H_ +#define MXNET_TENSOR_BLOB_H_ + +#include +#include +#include +#include +#include +#include +#include "./base.h" + +namespace mxnet { + +/*! + * \brief A dynamic sized array data strcuture + * that is optimized for storing small number of elements with same type. + * Data will be stored in stack when number of elements is small. + * + * It is suitable to hold Shape of Tensor. + * + * \tparam ValueType The type of data stored inside tuple. + * \sa TShape + */ +template +class Tuple { + public: + // Tuple requires the content to be simple data type. + static_assert(std::is_pod::value, + "Tuple only support simple data type like int"); + /*! \brief default constructor */ + Tuple() = default; + /*! \brief destructor */ + inline ~Tuple() { + delete [] data_heap_; + } + /*! + * \brief copy constructor from another tuple + * \param s the source tuple + */ + inline Tuple(const Tuple& s) { + this->assign(s.begin(), s.end()); + } + /*! + * \brief constructor from initializer list + * \param init the initializer_list + */ + inline Tuple(std::initializer_list init) { + this->assign(init.begin(), init.end()); + } + /*! + * \brief move constructor from Tuple + * \param src the source shape + */ + + inline Tuple(Tuple&& src) { // NOLINT(*) + this->swap(src); + } + /*! 
+ * \brief construct the Tuple from content of iterator + * \param begin the beginning of iterator + * \param end end the end of the iterator + * \tparam RandomAccessIterator iterator type + */ + template + inline Tuple(RandomAccessIterator begin, + RandomAccessIterator end) { + this->assign(begin, end); + } + /*! + * \brief Assign content to tuple from iterator. + * \param begin the beginning of iteratro + * \param end end the end of the iterator + * \tparam RandomAccessIterator iterator type + */ + template + inline void assign(RandomAccessIterator begin, + RandomAccessIterator end) { + this->SetDim(end - begin); + std::copy(begin, end, this->begin()); + } + /*! + * \brief Swap current object with other + * \param other another object to be swapped. + */ + inline void swap(Tuple& other) { // NOLINT(*) + std::swap(ndim_, other.ndim_); + std::swap(num_heap_allocated_, other.num_heap_allocated_); + std::swap(data_stack_, other.data_stack_); + std::swap(data_heap_, other.data_heap_); + } + /*! + * \brief assignment from another tuple. + * \param src source tuple + * \return reference of self + */ + inline Tuple& operator=(const Tuple& src) { + this->assign(src.begin(), src.end()); + return *this; + } + /*! + * \brief assignment from rvalue of another tuple. + * \param src source tuple + * \return reference of self + */ + inline Tuple& operator=(Tuple&& src) { + Tuple(std::move(src)).swap(*this); + return *this; + } + /*! + * \brief assignment from initializer list + * \param init the source initializer list + * \return reference of self + */ + inline Tuple &operator=(std::initializer_list init) { + this->assign(init.begin(), init.end()); + return *this; + } + /*! + * \return whether two tuple equals + * \param s the tuple to compare against + */ + inline bool operator==(const Tuple &s) const { + if (ndim_ != s.ndim_) return false; + return std::equal(begin(), end(), s.begin()); + } + /*! 
+ * \return whether two tuple not equal + * \param s the tuple to compare against + */ + inline bool operator!=(const Tuple &s) const { + return !(*this == s); + } + /*! \return the begin data pointer to content of the tuple */ + inline const ValueType *begin() const { + return ndim_ <= kStackCache ? data_stack_ : data_heap_; + } + /*! \return the begin data pointer to content of the tuple */ + inline ValueType *begin() { + return ndim_ <= kStackCache ? data_stack_ : data_heap_; + } + /*! \return the data pointer to end of the tuple */ + inline const ValueType* end() const { + return ndim_ <= kStackCache ? (data_stack_ + ndim_): (data_heap_ + ndim_); + } + /*! \return the data pointer to end the tuple */ + inline ValueType* end() { + return ndim_ <= kStackCache ? (data_stack_ + ndim_): (data_heap_ + ndim_); + } + /*! \return number of dimension of the tuple */ + inline index_t ndim() const { + return ndim_; + } + /*! + * \brief get corresponding index + * \param i dimension index + * \return the corresponding dimension size + */ + inline ValueType& operator[](index_t i) { + return begin()[i]; + } + /*! + * \brief get corresponding index + * \param i dimension index + * \return the corresponding dimension size + */ + inline const ValueType& operator[](index_t i) const { + return begin()[i]; + } + /*! + * \brief Save Tuple to JSON. + * \param writer JSONWriter + */ + inline void Save(dmlc::JSONWriter* writer) const { + std::vector tmp(begin(), end()); + writer->Write(tmp); + } + /*! + * \brief Load Tuple from JSON. + * \param reader JSONReader + */ + inline void Load(dmlc::JSONReader* reader) { + std::vector tmp; + reader->Read(&tmp); + this->assign(tmp.begin(), tmp.end()); + } + /*! 
+ * \brief allow output string of tuple to ostream + * \param os the output stream + * \param t the tuple + * \return the ostream + */ + friend std::ostream &operator<<(std::ostream &os, const Tuple &t) { + os << '('; + const ValueType* begin = t.begin(); + const ValueType* end = t.end(); + for (const ValueType* it = begin; it != end; ++it) { + if (it != begin) os << ','; + os << *it; + } + // python style tuple + if (t.ndim() == 1) os << ','; + os << ')'; + return os; + } + /*! + * \brief read tuple from the istream + * \param is the input stream + * \param t The tuple + * \return the istream + */ + friend std::istream &operator>>(std::istream &is, Tuple &t) { + // get ( + while (true) { + char ch = is.peek(); + if (isdigit(ch)) { + ValueType idx; + if (is >> idx) { + t.assign(&idx, &idx + 1); + } + return is; + } + is.get(); + if (ch == '(') break; + if (!isspace(ch)) { + is.setstate(std::ios::failbit); + return is; + } + } + index_t idx; + std::vector tmp; + while (is >> idx) { + tmp.push_back(idx); + char ch; + do { + ch = is.get(); + } while (isspace(ch)); + if (std::is_integral::value && ch == 'L') { + ch = is.get(); + } + if (ch == ',') { + while (true) { + ch = is.peek(); + if (isspace(ch)) { + is.get(); continue; + } + if (ch == ')') { + is.get(); break; + } + break; + } + if (ch == ')') break; + } else if (ch == ')') { + break; + } else { + is.setstate(std::ios::failbit); + return is; + } + } + t.assign(tmp.begin(), tmp.end()); + return is; + } + + protected: + // stack cache size + static const uint32_t kStackCache = 4; + /*! \brief number of dimension of the tuple */ + index_t ndim_{0}; + /*! \brief number of cells allocated in data_heap_ */ + index_t num_heap_allocated_{0}; + /*! \brief in stack space used to store shape when it is small */ + ValueType data_stack_[kStackCache]; + /*! 
\brief space to store shape when dimension is big*/ + ValueType* data_heap_{nullptr}; + // internal function to change the dimension + inline void SetDim(index_t dim) { + if (dim > kStackCache && + dim > num_heap_allocated_) { + delete [] data_heap_; + data_heap_ = new ValueType[dim]; + num_heap_allocated_ = dim; + } + ndim_ = dim; + } +}; + +/*! + * \brief A Shape class that is used to represent shape of each tensor. + */ +class TShape : public Tuple { + public: + /*! \brief default constructor */ + TShape() = default; + /*! + * constructor to construct a shape with all 1. + * \param ndim the number of dimension + */ + inline TShape(index_t ndim) { // NOLINT(*) + this->SetDim(ndim); + std::fill_n(begin(), ndim, 1); + } + /*! + * \brief copy constructor of TShape + * \param s source shape. + */ + inline TShape(const Tuple& s) { // NOLINT(*) + this->assign(s.begin(), s.end()); + } + /*! + * \brief constructor from initializer list + * \param init the initializer_list + */ + inline TShape(std::initializer_list init) { + this->assign(init.begin(), init.end()); + } + /*! + * \brief move constructor. + * \param s source shape. + */ + inline TShape(Tuple&& s) { // NOLINT(*) + this->swap(s); + } + /*! + * \brief construct the Tuple from content of iterator + * \param begin the beginning of iterator + * \param end end the end of the iterator + * \tparam RandomAccessIterator iterator type + */ + template + inline TShape(RandomAccessIterator begin, + RandomAccessIterator end) { + this->assign(begin, end); + } + /*! + * \brief assignment function from tshape + * \param src source shape. + * \return self. + */ + inline TShape& operator=(const Tuple& src) { + this->assign(src.begin(), src.end()); + return *this; + } + /*! + * \brief move assignment function from tshape + * \param src source shape. + * \return self. + */ + inline TShape& operator=(Tuple&& src) { // NOLINT(*) + TShape(std::move(src)).swap(*this); // NOLINT(*) + return *this; + } + /*! 
\return total number of elements in the shape */ + inline size_t Size() const { + size_t size = 1; + const index_t* start = begin(), *fin = end(); + for (const index_t* it = start; it != fin; ++it) { + size *= *it; + } + return size; + } + /*! + * \return product shape in [dimstart,dimend) + * \param dimstart start dimension + * \param dimend end dimension + */ + inline index_t ProdShape(int dimstart, int dimend) const { + index_t num = 1; + const index_t *d = this->data(); + for (int i = dimstart; i < dimend; ++i) { + num *= d[i]; + } + return num; + } + /*! \return the begin data pointer to content of the tuple */ + inline const index_t *data() const { + return begin(); + } + /*! \return the begin data pointer to content of the tuple */ + inline index_t *data() { + return begin(); + } +#ifdef MSHADOW_XINLINE + template + inline TShape(mshadow::Shape &&s) {// NOLINT(*) + this->assign(s.shape_, s.shape_ + dim); + } + /*! + * \brief assignment from shape + * \param shape source shape + * \tparam dim shape dimension + * \return reference of self + */ + template + inline TShape &operator=(const mshadow::Shape &shape) { + this->assign(shape.shape_, shape.shape_ + dim); + return *this; + } + /*! + * \brief get the shape of tensor specifying dim + * \return the shape requested + * \tparam dim dimension of the tensor + */ + template + inline mshadow::Shape get() const { + CHECK_EQ(dim, ndim()) + << "dimension do not match target dimension " << dim << " vs " << ndim(); + const index_t *d = this->data(); + mshadow::Shape s; + for (int i = 0; i < dim; ++i) { + s[i] = d[i]; + } + return s; + } + /*! 
+ * flatten the higher dimension to second dimension, return a 2D shape + * \return the flat 2d shape + */ + inline mshadow::Shape<2> FlatTo2D(void) const { + mshadow::Shape<2> s; + if (ndim() == 0) return mshadow::Shape2(0, 0); + const index_t *d = this->data(); + s.shape_[1] = d[ndim() - 1]; + index_t ymax = 1; + for (index_t i = 1; i < ndim(); ++i) { + ymax *= d[i - 1]; + } + s.shape_[0] = ymax; + return s; + } + /*! + * flatten the shape into three parts: [0, axis_begin), [axis_begin, axis_end], (axis_end, ndim) + * \param axis_begin The beginning axis specified. + * \param axis_end The ending axis specified. + * \return the flat 3d shape + */ + inline mshadow::Shape<3> FlatTo3D(index_t axis_begin, index_t axis_end) const { + CHECK(axis_end >= axis_begin); + mshadow::Shape<3> s; + if (ndim() == 0) return mshadow::Shape3(0, 0, 0); + const index_t *d = this->data(); + s.shape_[0] = 1; + s.shape_[1] = 1; + s.shape_[2] = 1; + + for (index_t i = 0; i < axis_begin; ++i) { + s.shape_[0] *= d[i]; + } + for (index_t i = axis_begin; i <= axis_end; ++i) { + s.shape_[1] *= d[i]; + } + for (index_t i = axis_end + 1; i < ndim(); ++i) { + s.shape_[2] *= d[i]; + } + return s; + } + /*! + * flatten the axis before and after the specified axis, so it becomes 3D tensor + * \param axis The axis specified. + * \return the flat 3d shape + */ + inline mshadow::Shape<3> FlatTo3D(index_t axis) const { + return FlatTo3D(axis, axis); + } + inline bool operator==(const TShape &s) const { + if (ndim() != s.ndim()) return false; + return std::equal(begin(), end(), s.begin()); + } + inline bool operator!=(const TShape &s) const { + return !(*this == s); + } + /*! + * \return whether two shape equals + * \param s the shape to compare against + * \tparam dim dimension of the shape + */ + template + inline bool operator==(const mshadow::Shape &s) const { + if (ndim_ != dim) return false; + const index_t *d = dim <= kStackCache ? 
data_stack_ : data_heap_; + for (index_t i = 0; i < dim; ++i) { + if (d[i] != s.shape_[i]) return false; + } + return true; + } + /*! + * \return whether two shape not equals + * \param s the shape to compare against + * \tparam dim dimension of the shape + */ + template + inline bool operator!=(const mshadow::Shape &s) const { + return !(*this == s); + } + /*! + * \brief save the content into binary stream + * \param strm the output stream + * \tparam TStream any stream type that have write + */ + template + inline void Save(TStream *strm) const { + strm->Write(&ndim_, sizeof(ndim_)); + strm->Write(data(), sizeof(index_t) * ndim_); + } + /*! + * \brief load the content from binary stream + * \param strm the output stream + * \tparam TStream any stream type that have write + * \return whether the load is successful + */ + template + inline bool Load(TStream *strm) { + if (strm->Read(&ndim_, sizeof(ndim_)) != sizeof(ndim_)) return false; + this->SetDim(ndim_); + size_t nread = sizeof(index_t) * ndim_; + if (strm->Read(data(), nread) != nread) return false; + return true; + } +#endif +}; + +/*! + * \brief tensor blob class that can be used to hold tensor of any dimension, + * any device and any data type, + * This is a weak type that can be used to transfer data through interface + * TBlob itself do not involve any arithmentic operations, + * but it can be converted to tensor of fixed dimension for further operations + * + * Like tensor, this data structure is like a pointer class and do not + * implicit allocated, de-allocate space. + * This data structure can be helpful to hold tensors of different dimensions + * and wait for further processing + */ +class TBlob { + public: + /*! \brief pointer to the data */ + void *dptr_; + /*! \brief shape of the tensor */ + TShape shape_; + /*! + * \brief storing the stride information in x dimension + */ + index_t stride_; + /*! \brief device mask of the corresponding device */ + int dev_mask_; + /*! 
\brief type flag of the tensor blob */ + int type_flag_; + /*! \brief default constructor, default copy assign will work */ + TBlob(void) + : dptr_(NULL), dev_mask_(cpu::kDevMask), + type_flag_(mshadow::DataType::kFlag) {} + /*! + * \brief constructor that construct TBlob from contiguous memory + * \param dptr the pointer to the memory + * \param shape the shape of the data + * \param dev_mask the device mask, can be cpu::kDevMask or gpu::kDevMask + */ + template + TBlob(DType *dptr, + const TShape &shape, + int dev_mask) + : dptr_(dptr), shape_(shape), + stride_(shape[shape.ndim() - 1]), + dev_mask_(dev_mask), + type_flag_(mshadow::DataType::kFlag) {} + /*! + * \brief constructor that construct TBlob from contiguous memory + * \param dptr the pointer to the memory + * \param shape the shape of the data + * \param dev_mask the device mask, can be cpu::kDevMask or gpu::kDevMask + * \param type_flag the type flag. Can be one of enum mshadow::dtype + */ + TBlob(void *dptr, + const TShape &shape, + int dev_mask, + int type_flag) + : dptr_(dptr), shape_(shape), + stride_(shape[shape.ndim() - 1]), + dev_mask_(dev_mask), + type_flag_(type_flag) {} + /*! + * \brief constructor from tensor + * \param src source tensor + * \tparam Device which device the tensor is on + * \tparam dim tensor dimension + * \tparam DType the type of elements in the tensor + */ + template + TBlob(const mshadow::Tensor &src) { // NOLINT(*) + *this = src; + } + /*! + * \brief assignment from tensor + * \param src source tensor + * \tparam Device which device the tensor is on + * \tparam dim tensor dimension + * \tparam DType the type of elements in the tensor + * \return reference of self + */ + template + inline TBlob + &operator=(const mshadow::Tensor &src) { + dptr_ = src.dptr_; + shape_ = src.shape_; + stride_ = src.stride_; + dev_mask_ = Device::kDevMask; + type_flag_ = mshadow::DataType::kFlag; + return *this; + } + /*! 
+ * \return whether the tensor's memory is continuous + */ + inline bool CheckContiguous(void) const { + return shape_[shape_.ndim() - 1] == stride_; + } + /*! + * \brief flatten the tensor to 2 dimension, collapse the higher dimensions together + * \param stream the possible stream target tensor should reside on + * \tparam Device which device the tensor is on + * \tparam DType the type of elements in the tensor + * \return tensor after flatten + */ + template + inline mshadow::Tensor FlatTo2D( + mshadow::Stream *stream = NULL) const { + CHECK(Device::kDevMask == dev_mask_) + << "TBlob.get: device type do not match specified type"; + CHECK(mshadow::DataType::kFlag == type_flag_) + << "TBlob.get_with_shape: data type do not match specified type." + << "Expected: " << type_flag_ << " v.s. given " << mshadow::DataType::kFlag; + return mshadow::Tensor(static_cast(dptr_), + shape_.FlatTo2D(), stride_, stream); + } + /*! + * \brief flatten the tensor to 1 dimension, collapse all the dimensions together. + * \param stream the possible stream target tensor should reside on + * \tparam Device which device the tensor is on + * \tparam DType the type of elements in the tensor + * \return tensor after flatten + */ + template + inline mshadow::Tensor FlatTo1D( + mshadow::Stream *stream = NULL) const { + return this->get_with_shape( + mshadow::Shape1(shape_.Size()), stream); + } + /*! \brief return number of dimension of the tensor inside */ + inline int ndim(void) const { + return shape_.ndim(); + } + /*! + * \brief return size of i-th dimension, start counting from highest dimension + * \param idx the dimension count from the highest dimensin + * \return the size + */ + inline index_t size(index_t idx) const { + return shape_[idx]; + } + /*! \brief total number of elements in the tensor */ + inline index_t Size(void) const { + return shape_.Size(); + } + /*! 
+ * \brief fetch the tensor, with respect to specific dimension + * if dim do not match the stored dimension, an error will be issued + * \return the tensor requested + * \param stream the possible stream target tensor should reside on + * \tparam Device which device the tensor is on + * \tparam dim dimension of the tensor + * \tparam DType the type of elements in the tensor + */ + template + inline mshadow::Tensor get(mshadow::Stream *stream = NULL) const { + CHECK(Device::kDevMask == dev_mask_) + << "TBlob.get: device type do not match specified type"; + CHECK(mshadow::DataType::kFlag == type_flag_) + << "TBlob.get_with_shape: data type do not match specified type." + << "Expected: " << type_flag_ << " v.s. given " << mshadow::DataType::kFlag; + return mshadow::Tensor(static_cast(dptr_), + shape_.get(), + stride_, stream); + } + /*! + * \brief fetch a tensor in given shape + * If size do not match the stored size, an error will be issued + * \return the tensor requested + * \param shape the shape required + * \param stream the possible stream target tensor should reside on + * \tparam Device which device the tensor is on + * \tparam dim dimension of the tensor + * \tparam DType the type of elements in the tensor + */ + template + inline mshadow::Tensor get_with_shape( + const mshadow::Shape &shape, + mshadow::Stream *stream = NULL) const { + CHECK(Device ::kDevMask == dev_mask_) + << "TBlob.get: device type do not match specified type"; + CHECK(mshadow::DataType::kFlag == type_flag_) + << "TBlob.get_with_shape: data type do not match specified type." + << "Expected: " << type_flag_ << " v.s. given " << mshadow::DataType::kFlag; + CHECK_EQ(this->CheckContiguous(), true) << "TBlob.get_reshape: must be contiguous"; + CHECK_EQ(this->shape_.Size(), shape.Size()) + << "TBlob.get_with_shape: new and old shape do not match total elements"; + return mshadow::Tensor(static_cast(dptr_), + shape, + shape[dim - 1], + stream); + } + /*! 
+ * \brief flatten the tensor to 3 dimension, + * collapse the dimension before and after specified axis. + * \param axis The axis specified. + * \param stream the possible stream target tensor should reside on + * \tparam Device which device the tensor is on + * \tparam DType the type of elements in the tensor + * \return tensor after flatten + */ + template + inline mshadow::Tensor FlatTo3D( + int axis, mshadow::Stream *stream = NULL) const { + return this->get_with_shape( + this->shape_.FlatTo3D(axis), stream); + } + /*! + * \brief flatten the tensor to 3 dimension, + * collapse the dimension: [0, axis_begin), [axis_begin, axis_end], (axis_end, ndim). + * \param axis_begin The beginning axis specified. + * \param axis_end The ending axis specified. + * \param stream the possible stream target tensor should reside on + * \tparam Device which device the tensor is on + * \tparam DType the type of elements in the tensor + * \return tensor after flatten + */ + template + inline mshadow::Tensor FlatTo3D( + int axis_begin, int axis_end, + mshadow::Stream *stream = NULL) const { + return this->get_with_shape( + this->shape_.FlatTo3D(axis_begin, axis_end), stream); + } +}; +} // namespace mxnet + +namespace dmlc { +// Add a few patches to support TShape in dmlc/parameter. 
+DMLC_DECLARE_TYPE_NAME(mxnet::TShape, "Shape(tuple)"); + +namespace parameter { + +template<> +class FieldEntry + : public FieldEntryBase, mxnet::TShape> { + public: + FieldEntry() : enforce_nonzero_(false), expect_ndim_(0) {} + // parent class + typedef FieldEntryBase, mxnet::TShape> Parent; + + virtual void Check(void *head) const { + Parent::Check(head); + mxnet::TShape &v = this->Get(head); + if (expect_ndim_ != 0 && v.ndim() != expect_ndim_) { + std::ostringstream os; + os << "value " << v << "for Parameter " << this->key_ + << " has wrong dimensions, expected dimension=" << expect_ndim_; + throw dmlc::ParamError(os.str()); + } + if (enforce_nonzero_) { + for (mxnet::index_t i = 0; i < v.ndim(); ++i) { + if (v[i] == 0U) { + std::ostringstream os; + os << "value " << v << "for Parameter " << this->key_ + << " is invalid, the input shape must be nonzero in all dimensions"; + throw dmlc::ParamError(os.str()); + } + } + } + } + inline FieldEntry &enforce_nonzero() { + this->enforce_nonzero_ = true; + return this->self(); + } + inline FieldEntry &set_expect_ndim(mxnet::index_t ndim) { + expect_ndim_ = ndim; + return this->self(); + } + + private: + // whether all the entries need to be nonzero + bool enforce_nonzero_; + // expected number of dimension, default = 0 means no restriction. 
+ mxnet::index_t expect_ndim_; +}; + +} // namespace parameter +} // namespace dmlc + +#endif // MXNET_TENSOR_BLOB_H_ diff --git a/make/config.mk b/make/config.mk index aa3986a21673..659d06e07acb 100644 --- a/make/config.mk +++ b/make/config.mk @@ -80,6 +80,16 @@ else USE_STATIC_MKL = NONE endif +#---------------------------- +# Settings for power and arm arch +#---------------------------- +ARCH := $(shell uname -a) +ifneq (,$(filter $(ARCH), armv6l armv7l powerpc64le ppc64le)) + USE_SSE=0 +else + USE_SSE=1 +endif + #---------------------------- # distributed computing #---------------------------- @@ -111,6 +121,11 @@ EXTRA_OPERATORS = # plugins #---------------------------- +# whether to use caffe integration. This requires installing caffe. +# You also need to add CAFFE_PATH/build/lib to your LD_LIBRARY_PATH +# CAFFE_PATH = $(HOME)/caffe +# MXNET_PLUGINS += plugin/caffe/caffe.mk + # whether to use torch integration. This requires installing torch. # You also need to add TORCH_PATH/install/lib to your LD_LIBRARY_PATH # TORCH_PATH = $(HOME)/torch diff --git a/make/readthedocs.mk b/make/readthedocs.mk index 51024ddf115e..b14c4baf7482 100644 --- a/make/readthedocs.mk +++ b/make/readthedocs.mk @@ -64,3 +64,6 @@ USE_S3 = 0 # path to libjvm.so LIBJVM=$(JAVA_HOME)/jre/lib/amd64/server + +# uses O0 instead of O3 for better performance +DEBUG = 1 diff --git a/mshadow b/mshadow index 44d61f8ef9d8..223b45a5cedf 160000 --- a/mshadow +++ b/mshadow @@ -1 +1 @@ -Subproject commit 44d61f8ef9d86e85e7bc62b2a1d4dc40554672f1 +Subproject commit 223b45a5cedf126a50b6c8ca4c82ede8c81874e0 diff --git a/plugin/caffe/README.md b/plugin/caffe/README.md new file mode 100644 index 000000000000..2a28e012a53a --- /dev/null +++ b/plugin/caffe/README.md @@ -0,0 +1,49 @@ +# How to use Caffe operator in MXNet + +[Caffe](http://caffe.berkeleyvision.org/) has been a well-known and widely-used deep learning framework. 
Now MXNet has supported calling most caffe operators(layers) and loss functions directly in its symbolic graph! Using one's own customized caffe layer is also effortless. + +Besides Caffe, MXNet has already embedded Torch modules and its tensor mathematical functions. ([link](https://github.com/dmlc/mxnet/blob/master/docs/how_to/torch.md)) + +This blog demonstrates two steps to use Caffe op in MXNet: + +* How to install MXNet with Caffe support. + +* How to embed Caffe op into MXNet's symbolic graph. + +## Install Caffe With MXNet interface +* Download official Caffe repository [BVLC/Caffe](https://github.com/BVLC/caffe). +* Download [caffe patch for mxnet interface](https://github.com/BVLC/caffe/pull/4527.patch). Move the patch file under your caffe root folder and apply the patch by `git apply patch_file_name`. +* Install caffe following [official guide](http://caffe.berkeleyvision.org/installation.html). + +## Compile with Caffe +* In mxnet folder, open `config.mk` (if you haven't already, copy `make/config.mk` (Linux) or `make/osx.mk` (Mac) into MXNet root folder as `config.mk`) and uncomment the lines `CAFFE_PATH = $(HOME)/caffe` and `MXNET_PLUGINS += plugin/caffe/caffe.mk`. Modify `CAFFE_PATH` to your caffe installation if necessary. +* Run `make clean && make` to build with caffe support. + +## Caffe Operator (Layer) +Caffe's neural network operator and loss functions are supported by MXNet through `mxnet.symbol.CaffeOp` and `mxnet.symbol.CaffeLoss` respectively. 
+For example, the following code shows a multi-layer perceptron network for classifying MNIST digits ([full code](https://github.com/dmlc/mxnet/blob/master/example/caffe/caffe_net.py)): + +### Python +```Python +data = mx.symbol.Variable('data') +fc1 = mx.symbol.CaffeOp(data_0=data, num_weight=2, name='fc1', prototxt="layer{type:\"InnerProduct\" inner_product_param{num_output: 128} }") +act1 = mx.symbol.CaffeOp(data_0=fc1, prototxt="layer{type:\"TanH\"}") +fc2 = mx.symbol.CaffeOp(data_0=act1, num_weight=2, name='fc2', prototxt="layer{type:\"InnerProduct\" inner_product_param{num_output: 64} }") +act2 = mx.symbol.CaffeOp(data_0=fc2, prototxt="layer{type:\"TanH\"}") +fc3 = mx.symbol.CaffeOp(data_0=act2, num_weight=2, name='fc3', prototxt="layer{type:\"InnerProduct\" inner_product_param{num_output: 10}}") +mlp = mx.symbol.SoftmaxOutput(data=fc3, name='softmax') +``` + +Let's break it down. First `data = mx.symbol.Variable('data')` defines a variable as a placeholder for input. +Then it's fed through Caffe operators with `fc1 = mx.symbol.CaffeOp(data_0=data, num_weight=2, name='fc1', prototxt="layer{type:\"InnerProduct\" inner_product_param{num_output: 128} }")`. + +The inputs to caffe op are named as data_i for i=0 ... num_data-1 as `num_data` is the number of inputs. You may skip the argument, as the example does, if its value is 1. While `num_weight` is the number of `blobs_`(weights). Its default value is 0, as many ops maintain no weight. `prototxt` is the configuration string. + +We could also replace the last line by: + +```Python +label = mx.symbol.Variable('softmax_label') +mlp = mx.symbol.CaffeLoss(data=fc3, label=label, grad_scale=1, name='softmax', prototxt="layer{type:\"SoftmaxWithLoss\"}") +``` + +to use the loss function in caffe. 
diff --git a/plugin/caffe/caffe.mk b/plugin/caffe/caffe.mk new file mode 100644 index 000000000000..ef9ec0a45a32 --- /dev/null +++ b/plugin/caffe/caffe.mk @@ -0,0 +1,15 @@ +CFLAGS += -I$(CAFFE_PATH)/include -I$(CAFFE_PATH)/build/src +LDFLAGS += -lprotobuf -lboost_system -lboost_thread -lboost_filesystem -lgflags -lglog -L$(CAFFE_PATH)/build/lib -lcaffe + +ifeq ($(USE_CUDNN), 1) + CFLAGS += -DUSE_CUDNN=1 +endif + +ifeq ($(USE_CUDA), 0) + CFLAGS += -DCPU_ONLY=1 +endif + +CAFFE_SRC = $(wildcard plugin/caffe/*.cc) +PLUGIN_OBJ += $(patsubst %.cc, build/%.o, $(CAFFE_SRC)) +CAFFE_CUSRC = $(wildcard plugin/caffe/*.cu) +PLUGIN_CUOBJ += $(patsubst %.cu, build/%_gpu.o, $(CAFFE_CUSRC)) diff --git a/plugin/caffe/caffe_blob.cc b/plugin/caffe/caffe_blob.cc new file mode 100644 index 000000000000..dded9ca653f7 --- /dev/null +++ b/plugin/caffe/caffe_blob.cc @@ -0,0 +1,75 @@ +/*! + * Copyright (c) 2016 by Contributors + * \file caffe_blob.cc + * \brief Implementations of SetDataGradToBlob given various device/dimension + * \author Haoran Wang +*/ +#include "caffe_blob.h" +namespace mxnet { +namespace op { +namespace caffe { + +template<> +void SetDataGradToBlob(caffeMemoryTypes memType, + std::vector<::caffe::Blob*>::iterator blob, + std::vector::const_iterator itr) { + float *data_ptr = reinterpret_cast((*itr).dptr_); + if (memType == Data) + (*blob)->set_cpu_data(data_ptr); + else + (*blob)->set_cpu_diff(data_ptr); +} + +template<> +void SetDataGradToBlob(caffeMemoryTypes memType, + std::vector<::caffe::Blob*>::iterator blob, + std::vector::const_iterator itr) { + double *data_ptr = reinterpret_cast((*itr).dptr_); + if (memType == Data) + (*blob)->set_cpu_data(data_ptr); + else + (*blob)->set_cpu_diff(data_ptr); +} + +template<> +void SetDataGradToBlob(caffeMemoryTypes memType, + std::vector<::caffe::Blob*>::iterator blob, + std::vector::const_iterator itr) { + float *data_ptr = reinterpret_cast((*itr).dptr_); + if (memType == Data) + (*blob)->set_gpu_data(data_ptr); + else + 
(*blob)->set_gpu_diff(data_ptr); +} + +template<> +void SetDataGradToBlob(caffeMemoryTypes memType, + std::vector<::caffe::Blob*>::iterator blob, + std::vector::const_iterator itr) { + double *data_ptr = reinterpret_cast((*itr).dptr_); + if (memType == Data) + (*blob)->set_gpu_data(data_ptr); + else + (*blob)->set_gpu_diff(data_ptr); +} + +TShape Vector2TShape(const std::vector &vec_int) { + std::vector vec; + for (uint32_t i = 0; i < vec_int.size(); ++i) + vec.push_back(vec_int[i]); + // 0-dim represents scalar in caffe + if (vec_int.size() == 0) + vec.push_back(1); + return {vec.begin(), vec.end()}; +} + +std::vector TShape2Vector(const TShape &tshape) { + std::vector s; + for (uint32_t i =0 ; i < tshape.ndim(); ++i) + s.push_back(tshape[i]); + return s; +} + +} // namespace caffe +} // namespace op +} // namespace mxnet diff --git a/plugin/caffe/caffe_blob.h b/plugin/caffe/caffe_blob.h new file mode 100644 index 000000000000..24bf46e95638 --- /dev/null +++ b/plugin/caffe/caffe_blob.h @@ -0,0 +1,59 @@ +/*! 
+ * Copyright (c) 2016 by Contributors + * \file caffe_blob.h + * \brief conversion between tensor and caffeBlob + * \author Haoran Wang +*/ +#ifndef PLUGIN_CAFFE_CAFFE_BLOB_H_ +#define PLUGIN_CAFFE_CAFFE_BLOB_H_ + +#include +#include +#include +#include + +namespace mxnet { +namespace op { + +namespace caffe { + +// Declare Memory Type for Caffe blob +enum caffeMemoryTypes {Data, Grad, Non}; + +TShape Vector2TShape(const std::vector &vec_int); +std::vector TShape2Vector(const TShape &tshape); + +// implementation of tensor to blob, called by TensorToBlob +template +void SetDataGradToBlob(caffeMemoryTypes memType, + typename std::vector< ::caffe::Blob*>::iterator blob, + typename std::vector::const_iterator itr); + +/** + * \brief The interface to convert mxnet's tensor to caffe's blob + * \brief called in caffe_operator_inl.h + */ +template +void TBlob2CaffeBlob(caffeMemoryTypes memType, + typename std::vector< ::caffe::Blob*>::iterator blob, + typename std::vector::const_iterator tblob, + int n = 1) { + for (int i = 0; i < n; ++i, ++blob, ++tblob) { + (*blob)->Reshape(TShape2Vector((*tblob).shape_)); + SetDataGradToBlob(memType, blob, tblob); + } +} + +template +void SetOpBlobs(::caffe::Layer *caffeOp, + const std::vector< ::caffe::Blob*>& weights) { + CHECK_EQ(caffeOp->blobs().size(), weights.size()); + for (int i = 0; i < weights.size(); ++i) + caffeOp->blobs()[i].reset(weights[i]); +} + +} // namespace caffe +} // namespace op +} // namespace mxnet + +#endif // PLUGIN_CAFFE_CAFFE_BLOB_H_ diff --git a/plugin/caffe/caffe_common.cc b/plugin/caffe/caffe_common.cc new file mode 100644 index 000000000000..722b19138f79 --- /dev/null +++ b/plugin/caffe/caffe_common.cc @@ -0,0 +1,29 @@ +/*! 
+ * Copyright (c) 2016 by Contributors + * \file caffe_common.h + * \brief Common functions for caffeOp and caffeLoss symbols + * \author Haoran Wang +*/ +#include +#include +#include"caffe_common.h" + +namespace mxnet { +namespace op { +namespace caffe { + +// Cpu implementation of set_mode +template<> +void CaffeMode::SetMode() { + ::caffe::Caffe::set_mode(::caffe::Caffe::CPU); +} + +// Gpu implementation of set_mode +template<> +void CaffeMode::SetMode() { + ::caffe::Caffe::set_mode(::caffe::Caffe::GPU); +} + +} // namespace caffe +} // namespace op +} // namespace mxnet diff --git a/plugin/caffe/caffe_common.h b/plugin/caffe/caffe_common.h new file mode 100644 index 000000000000..27285e1a5da2 --- /dev/null +++ b/plugin/caffe/caffe_common.h @@ -0,0 +1,68 @@ +/*! + * Copyright (c) 2016 by Contributors + * \file caffe_common.h + * \brief Common functions for caffeOp and caffeLoss symbols + * \author Haoran Wang +*/ + +#ifndef PLUGIN_CAFFE_CAFFE_COMMON_H_ +#define PLUGIN_CAFFE_CAFFE_COMMON_H_ + +#include +#include +#include +#include + +#include +#include +#include + +namespace mxnet { +namespace op { +namespace caffe { + +/** + * \brief The class sets caffe's mode before doing forward/backward + * \tparam xpu The device that the op will be executed on. 
+ */ +class CaffeMode { + public: + template static void SetMode(); +}; + +// Initialization funciton called by caffeOp & caffeLoss +template +void InitCaffeBlobs(std::vector< ::caffe::Blob*>* v, int n_num) { + for (index_t i=0; i < n_num; ++i) + v->push_back(new ::caffe::Blob()); +} + +template +void DelCaffeBlobs(std::vector< ::caffe::Blob*>* v, int n_num) { + for (index_t i=0; i < n_num; ++i) + delete v->at(i); +} + + +struct NULLDeleter {template void operator()(T*){}}; + +template +void Deleter(::caffe::Layer *ptr) { +} + +template +class LayerRegistry { + public: + static ::caffe::Layer * CreateLayer(const ::caffe::LayerParameter& param) { + ::caffe::shared_ptr< ::caffe::Layer > ptr = + ::caffe::LayerRegistry::CreateLayer(param); + // avoid caffe::layer destructor, which deletes the weights layer owns + new ::caffe::shared_ptr< ::caffe::Layer >(ptr); + return ptr.get(); + } +}; + +} // namespace caffe +} // namespace op +} // namespace mxnet +#endif // PLUGIN_CAFFE_CAFFE_COMMON_H_ diff --git a/plugin/caffe/caffe_data_iter.cc b/plugin/caffe/caffe_data_iter.cc new file mode 100644 index 000000000000..d2f20fa0f2c7 --- /dev/null +++ b/plugin/caffe/caffe_data_iter.cc @@ -0,0 +1,252 @@ +/*! + * Copyright (c) 2015 by Contributors + * \file caffe_data_iter.cc + * \brief register mnist iterator +*/ +#include +#include +#include +#include + +#include "caffe_common.h" +#include "caffe_stream.h" +#include "caffe_fieldentry.h" +#include "caffe_blob.h" +#include "../../src/io/inst_vector.h" +#include "../../src/io/iter_prefetcher.h" +#include "../../src/operator/cast-inl.h" + +#define CHECK_NEXT_TIMING + +#ifdef CHECK_NEXT_TIMING +#define IF_CHECK_TIMING(__t$) __t$ +#else +#define IF_CHECK_TIMING(__t$) +#endif + +namespace mxnet { +namespace io { + +struct CaffeDataParam : public dmlc::Parameter { + /*! \brief protobuf text */ + ::caffe::LayerParameter prototxt; + /*! \brief number of iterations per epoch */ + int num_examples; + /*! 
\brief data mode */ + bool flat; + + DMLC_DECLARE_PARAMETER(CaffeDataParam) { + DMLC_DECLARE_FIELD(prototxt).set_default("layer{}") + .describe("Caffe's layer parameter"); + DMLC_DECLARE_FIELD(flat).set_default(false) + .describe("Augmentation Param: Whether to flat the data into 1D."); + DMLC_DECLARE_FIELD(num_examples).set_lower_bound(1).set_default(10000) + .describe("Number of examples in the epoch."); + } +}; + +template +class CaffeDataIter : public IIterator { + public: + explicit CaffeDataIter(int type_flag) : batch_size_(0), channels_(1), width_(1), height_(1) + , type_flag_(type_flag), loc_(0) + {} + virtual ~CaffeDataIter(void) {} + + // intialize iterator loads data in + virtual void Init(const std::vector >& kwargs) { + std::map kmap(kwargs.begin(), kwargs.end()); + param_.InitAllowUnknown(kmap); + + // Caffe seems to understand phase inside an "include {}" block + if (!param_.prototxt.has_phase()) { + if (param_.prototxt.include().size()) { + if (param_.prototxt.include(0).has_phase()) { + param_.prototxt.set_phase(param_.prototxt.include(0).phase()); + } + } + } + + std::string type = param_.prototxt.type(); + caffe_data_layer_ = caffe::LayerRegistry::CreateLayer(param_.prototxt); + CHECK(caffe_data_layer_ != nullptr) << "Failed creating caffe data layer"; + const size_t top_size = param_.prototxt.top_size(); + if (top_size > 0) { + if (top_size > NR_SUPPORTED_TOP_ITEMS) { + LOG(WARNING) + << "Too may \"top\" items, only two (one data, one label) are currently supported"; + } + top_.reserve(top_size); + for (size_t x = 0; x < top_size; ++x) { + ::caffe::Blob *blob = new ::caffe::Blob(); + cleanup_blobs_.push_back(std::unique_ptr<::caffe::Blob>(blob)); + top_.push_back(blob); + } + caffe_data_layer_->SetUp(bottom_, top_); + const std::vector &shape = top_[DATA]->shape(); + const size_t shapeDimCount = shape.size(); + if (shapeDimCount > 0) { + batch_size_ = shape[0]; + if (shapeDimCount > 1) { + channels_ = shape[1]; + if (shapeDimCount > 2) { + 
width_ = shape[2]; + if (shapeDimCount > 3) { + height_ = shape[3]; + } + } + } + } + + if (top_size > DATA) { + if (param_.flat) { + batch_data_ = TBlob(nullptr, mshadow::Shape2(batch_size_, + channels_ * width_ * height_), + cpu::kDevCPU, type_flag_); + } else { + batch_data_ = TBlob(nullptr, mxnet::TShape(top_[DATA]->shape().begin(), + top_[DATA]->shape().end()), + cpu::kDevCPU, type_flag_); + } + } + out_.data.clear(); + if (top_size > LABEL) { + batch_label_ = TBlob(nullptr, mxnet::TShape(top_[LABEL]->shape().begin(), + top_[LABEL]->shape().end()), + cpu::kDevCPU, type_flag_); + } + out_.batch_size = batch_size_; + } + } + + virtual void BeforeFirst(void) { + loc_ = 0; + } + + virtual bool Next(void) { + // MxNet iterator is expected to return CPU-accessible memory + if (::caffe::Caffe::mode() != ::caffe::Caffe::CPU) { + ::caffe::Caffe::set_mode(::caffe::Caffe::CPU); + CHECK_EQ(::caffe::Caffe::mode(), ::caffe::Caffe::CPU); + } + caffe_data_layer_->Forward(bottom_, top_); + CHECK_GT(batch_size_, 0) << "batch size must be greater than zero"; + CHECK_EQ(out_.batch_size, batch_size_) << "Internal Error: batch size mismatch"; + + if (loc_ + batch_size_ <= param_.num_examples) { + batch_data_.dptr_ = top_[DATA]->mutable_cpu_data(); + batch_label_.dptr_ = top_[LABEL]->mutable_cpu_data(); + + out_.data.clear(); + out_.data.push_back(batch_data_); + out_.data.push_back(batch_label_); + loc_ += batch_size_; + return true; + } + + return false; + } + + virtual const TBlobBatch &Value(void) const { + return out_; + } + + private: + /*! \brief indexes into top_ */ + enum { DATA = 0, LABEL, NR_SUPPORTED_TOP_ITEMS }; + + /*! \brief MNISTCass iter params */ + CaffeDataParam param_; + /*! \brief Shape scalar values */ + index_t batch_size_, channels_, width_, height_; + /*! \brief Caffe data layer */ + boost::shared_ptr > caffe_data_layer_; + /*! \brief batch data blob */ + mxnet::TBlob batch_data_; + /*! \brief batch label blob */ + mxnet::TBlob batch_label_; + /*! 
\brief Output blob data for this iteration */ + TBlobBatch out_; + /*! \brief Bottom and top connection-point blob data */ + std::vector<::caffe::Blob*> bottom_, top_; + /*! \brief Cleanup these blobs on exit */ + std::list>> cleanup_blobs_; + /*! \brief type flag of the tensor blob */ + const int type_flag_; + /*! \brief Blobs done so far */ + std::atomic loc_; +}; // class CaffeDataIter + +class CaffeDataIterWrapper : public PrefetcherIter { + public: + CaffeDataIterWrapper() : PrefetcherIter(NULL), next_time_(0) {} + virtual ~CaffeDataIterWrapper() { + IF_CHECK_TIMING( + if (next_time_.load() > 0) { + LOG(WARNING) << "Caffe data loader was blocked for " + << next_time_.load() + << " ms waiting for incoming data"; + } + ) + } + virtual void Init(const std::vector >& kwargs) { + // We need to init prefetcher args in order to get dtype + this->param_.InitAllowUnknown(kwargs); + switch (this->param_.dtype) { + case mshadow::kFloat32: + this->loader_.reset(new CaffeDataIter(this->param_.dtype)); + break; + case mshadow::kFloat64: + this->loader_.reset(new CaffeDataIter(this->param_.dtype)); + break; + case mshadow::kFloat16: + LOG(FATAL) << "float16 layer is not supported by caffe"; + return; + default: + LOG(FATAL) << "Unsupported type " << this->param_.dtype; + return; + } + PrefetcherIter::Init(kwargs); + this->param_.prefetch_buffer = 1; + } + virtual void BeforeFirst(void) { + return PrefetcherIter::BeforeFirst(); + } + virtual bool Next(void) { + IF_CHECK_TIMING( + const uint64_t start_time = GetTickCountMS(); + ) + const bool rc = PrefetcherIter::Next(); + IF_CHECK_TIMING( + const uint64_t diff_time = GetTickCountMS() - start_time; + next_time_.fetch_add(diff_time); + ) + return rc; + } + + protected: + IF_CHECK_TIMING( + static uint64_t GetTickCountMS() { + struct timeval tv; + gettimeofday(&tv, 0); + return uint64_t( tv.tv_sec ) * 1000 + tv.tv_usec / 1000; + } + ) + + /*! 
\brief milliseconds spent in Next() */ + std::atomic next_time_; +}; // class CaffeDataIterWrapper + +DMLC_REGISTER_PARAMETER(CaffeDataParam); + +MXNET_REGISTER_IO_ITER(CaffeDataIter) +.describe("Create MxNet iterator for a Caffe data layer.") +.add_arguments(CaffeDataParam::__FIELDS__()) +.add_arguments(PrefetcherParam::__FIELDS__()) +.set_body([]() { + return new CaffeDataIterWrapper(); +}); + +} // namespace io +} // namespace mxnet + diff --git a/plugin/caffe/caffe_fieldentry.h b/plugin/caffe/caffe_fieldentry.h new file mode 100644 index 000000000000..4f92e6691751 --- /dev/null +++ b/plugin/caffe/caffe_fieldentry.h @@ -0,0 +1,88 @@ +/*! + * Copyright (c) 2016 by Contributors + * \file caffe_fieldentry.h + * \brief Implement FieldEntry + * \author Haoran Wang + */ +#ifndef PLUGIN_CAFFE_CAFFE_FIELDENTRY_H_ +#define PLUGIN_CAFFE_CAFFE_FIELDENTRY_H_ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +namespace dmlc { +namespace parameter { + +// specialize define for Layer Parameter +template<> +class FieldEntry + : public FieldEntryBase, caffe::LayerParameter> { + public: + // parent class + typedef FieldEntryBase, caffe::LayerParameter> Parent; + + + bool ReadProtoFromTextContent(const std::string& text, + ::google::protobuf::Message* proto) const { + bool success = google::protobuf::TextFormat::ParseFromString(text, proto); + return success; + } + + /** + * /brief Customize set method for LayerParameter + * /tparam value string of caffe's layer configuration + * */ + virtual void Set(void *head, const std::string &value) const { + caffe::NetParameter net_param; + if (!ReadProtoFromTextContent(value, &net_param)) + CHECK(false)<< "Caffe Net Prototxt: " << value << "Initialized Failed"; + + CHECK_EQ(net_param.layer_size(), 1) << "Prototxt" << value <<" more than a layer"; + caffe::LayerParameter *layer_param = 
new caffe::LayerParameter(net_param.layer(0)); + this->Get(head) = (*layer_param); + } + + virtual void PrintValue(std::ostream &os, caffe::LayerParameter value) const { // NOLINT(*) + } + + virtual void PrintDefaultValueString(std::ostream &os) const { // NOLINT(*) + os << '\'' << default_value_.name().c_str() << '\''; + } + + // override set_default + inline FieldEntry &set_default(const std::string &value) { + caffe::NetParameter net_param; + if (!ReadProtoFromTextContent(value, &net_param)) + CHECK(false)<< "Caffe Net Prototxt: " << value << "Initialized Failed"; + + CHECK_EQ(net_param.layer_size(), 1) << "Protoxt " << value <<" is more than one layer"; + default_value_ = caffe::LayerParameter(net_param.layer(0)); + has_default_ = true; + // return self to allow chaining + return this->self(); + } +}; + +} // namespace parameter +} // namespace dmlc + +#endif // PLUGIN_CAFFE_CAFFE_FIELDENTRY_H_ diff --git a/plugin/caffe/caffe_loss-inl.h b/plugin/caffe/caffe_loss-inl.h new file mode 100644 index 000000000000..802e3923ab64 --- /dev/null +++ b/plugin/caffe/caffe_loss-inl.h @@ -0,0 +1,283 @@ +/*! 
+ * Copyright (c) 2016 by Contributors + * \file caffe_loss-inl.h + * \brief Caffe Operator + * \author Haoran Wang +*/ +#ifndef PLUGIN_CAFFE_CAFFE_LOSS_INL_H_ +#define PLUGIN_CAFFE_CAFFE_LOSS_INL_H_ + +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "../../src/operator/operator_common.h" +#include "caffe_common.h" +#include "caffe_stream.h" +#include "caffe_fieldentry.h" +#include "caffe_blob.h" + +namespace mxnet { +namespace op { + +struct CaffeLossParam : public dmlc::Parameter { + ::caffe::LayerParameter prototxt; + int num_data, num_out; + float grad_scale; + + DMLC_DECLARE_PARAMETER(CaffeLossParam) { DMLC_DECLARE_FIELD(prototxt).set_default("layer{}") + .describe("Caffe's layer parameter"); + DMLC_DECLARE_FIELD(num_data).set_range(0, 100).set_default(2) + .describe("Operator input number"); + DMLC_DECLARE_FIELD(num_out).set_range(0, 100).set_default(1) + .describe("Operator output number"); + DMLC_DECLARE_FIELD(grad_scale) + .set_default(1.0f) + .describe("Scale the gradient by a float factor (a.k.a weight of this loss)."); + } +}; + +/** + * \brief this is the implementation of caffe operator in caffe. + * \tparam xpu the device that the op will be executed on. 
+ */ +template +class CaffeLoss : public Operator { + public: + explicit CaffeLoss(CaffeLossParam p):param_(p), + setup_(false) { + std::string type = param_.prototxt.type(); + caffeOp_ = caffe::LayerRegistry::CreateLayer(param_.prototxt); + grad_scale_ = (Dtype)param_.grad_scale; + + caffe::InitCaffeBlobs(&bot_, param_.num_data); + caffe::InitCaffeBlobs(&top_, param_.num_out); + flags_.resize(param_.num_data); + } + + ~CaffeLoss() { + caffe::DelCaffeBlobs(&bot_, param_.num_data); + caffe::DelCaffeBlobs(&top_, param_.num_out); + } + + virtual void Forward(const OpContext &ctx, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data, + const std::vector &aux_args) { + // Set mode before forward + caffe::CaffeMode::SetMode(); + using ::caffe::Blob; + using std::vector; + using namespace mshadow; + using namespace mshadow::expr; + for (uint32_t i = 0; i < req.size(); ++i) + CHECK_EQ(req[i], kWriteTo); + + CHECK_EQ(in_data.size(), param_.num_data); + CHECK_EQ(out_data.size(), param_.num_out); + +#if defined(__CUDACC__) + Stream *s = ctx.get_stream(); + // TODO(Haoran): when need cublas handle in stream? 
+ CHECK_EQ(s->blas_handle_ownership_, Stream::OwnHandle) + << "Must init CuBLAS handle in stream"; +#endif // __CUDACC__ + + caffe::TBlob2CaffeBlob(caffe::Data, + bot_.begin(), + in_data.begin(), + param_.num_data); + caffe::TBlob2CaffeBlob(caffe::Data, + top_.begin(), + out_data.begin(), + param_.num_out); + CaffeOpSetup(); + if (ctx.is_train) + caffeOp_->SetPhase(::caffe::TRAIN); + else + caffeOp_->SetPhase(::caffe::TEST); + caffeOp_->Forward(bot_, top_); + +#if defined(__CUDACC__) + // Sync cpu data to gpu data + for (uint32_t i = 0; i < top_.size(); ++i) + top_[i]->gpu_data(); + + CHECK_EQ(cudaStreamSynchronize(NULL), cudaSuccess); +#endif // __CUDACC__ + } + + // Set up caffe op with real data + void CaffeOpSetup() { + if (!setup_) { + setup_ = true; + caffeOp_->SetUp(bot_, top_); + } + } + + virtual void Backward(const OpContext &ctx, + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &req, + const std::vector &in_grad, + const std::vector &aux_args) { + // Set mode before backward + caffe::CaffeMode::SetMode(); + using namespace mshadow; + using namespace mshadow::expr; + CHECK_EQ(out_grad.size(), param_.num_out); + for (int i = 0; i < param_.num_data; ++i) + CHECK(req[i] != kAddTo) << "caffe doesn't accm diff on bottom data"; + CHECK(in_data.size() == param_.num_data); + +#if defined(__CUDACC__) + Stream *s = ctx.get_stream(); + // TODO(Haoran): when need cublas handle in stream? 
+ CHECK_EQ(s->blas_handle_ownership_, Stream::OwnHandle)
+ << "Must init CuBLAS handle in stream";
+#endif // __CUDACC__
+
+ caffe::TBlob2CaffeBlob(caffe::Grad,
+ bot_.begin(),
+ in_grad.begin(),
+ param_.num_data);
+ // Pass grad scale to caffe blob
+ top_[0]->set_cpu_diff(&grad_scale_);
+
+ // Set BP flag
+ for (int i = 0; i < param_.num_data; ++i)
+ flags_[i] = req[i] != kNullOp;
+
+ caffeOp_->Backward(top_, flags_, bot_);
+
+#if defined(__CUDACC__)
+ // Sync cpu diff to gpu diff
+ for (uint32_t i = 0; i < bot_.size(); ++i)
+ bot_[i]->gpu_diff();
+
+ CHECK_EQ(cudaStreamSynchronize(NULL), cudaSuccess);
+#endif // __CUDACC__
+ }
+
+ private:
+ CaffeLossParam param_;
+ ::caffe::Layer *caffeOp_;
+ Dtype grad_scale_;
+ std::vector< ::caffe::Blob *> bot_, top_;
+ std::vector flags_;
+ bool setup_;
+}; // class CaffeLoss
+
+// Declare Factory function, used for dispatch specialization
+template
+Operator* CreateOp(CaffeLossParam param, int);
+
+#if DMLC_USE_CXX11
+class CaffeLossProp : public OperatorProperty {
+ public:
+ std::vector ListArguments() const override {
+ return {"data", "label"};
+ }
+
+ void Init(const std::vector >& kwargs) override {
+ param_.Init(kwargs);
+ CHECK_EQ(param_.num_out, 1);
+ CHECK_EQ(param_.num_data, 2);
+
+ // Fetch grad_scale from prototxt
+ if ((param_.prototxt.loss_weight_size() > 0))
+ param_.grad_scale = param_.prototxt.loss_weight(0);
+ }
+
+ std::map GetParams() const override {
+ return param_.__DICT__();
+ }
+
+ /*brief Set up caffeop to infer output shape*/
+ bool InferShape(std::vector *in_shape,
+ std::vector *out_shape,
+ std::vector *aux_shape) const override {
+ using namespace mshadow;
+ using ::caffe::Blob;
+ using std::vector;
+ if (caffeOp_ == NULL)
+ caffeOp_ = caffe::LayerRegistry::CreateLayer(param_.prototxt);
+
+ CHECK_GE(in_shape->size(), param_.num_data);
+ // Initialize empty bottom & top blobs for caffeOp setup
+ vector *> bot_blobs, top_blobs;
+
+ for (int i = 0; i < param_.num_data; ++i) {
+ TShape tshape = 
(*in_shape)[i]; + if (tshape.ndim() == 0) return false; + auto blob_ptr = new Blob(); + blob_ptr->Reshape(caffe::TShape2Vector(tshape)); + bot_blobs.push_back(blob_ptr); + } + + for (int i = 0; i < param_.num_out; ++i) + top_blobs.push_back(new Blob()); + + caffeOp_->SetUp(bot_blobs, top_blobs); + CHECK_EQ(in_shape->size(), caffeOp_->blobs().size() + param_.num_data); + // Initialize out shapes + out_shape->clear(); + for (auto blob : top_blobs) { + TShape tshape = caffe::Vector2TShape(blob->shape()); + out_shape->push_back(tshape); + } + + for (auto blob_ptr : bot_blobs) + delete blob_ptr; + for (auto blob_ptr : top_blobs) + delete blob_ptr; + + return true; + } + + OperatorProperty* Copy() const override { + auto copy_prop = new CaffeLossProp(); + copy_prop->param_ = this->param_; + return copy_prop; + } + + std::string TypeString() const override { + return "CaffeLoss"; + } + + std::vector DeclareBackwardDependency( + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data) const override { + std::vector dep; + dep.insert(dep.end(), in_data.begin(), in_data.end()); + dep.insert(dep.end(), out_data.begin(), out_data.end()); + return dep; + } + + Operator* CreateOperator(Context ctx) const override { + LOG(FATAL) << "Not Implemented."; + return NULL; + } + + Operator* CreateOperatorEx(Context ctx, std::vector *in_shape, + std::vector *in_type) const override; + + + private: + mutable CaffeLossParam param_; + mutable ::caffe::Layer *caffeOp_; +}; // class CaffeLossSymbol +#endif + +} // namespace op +} // namespace mxnet +#endif // PLUGIN_CAFFE_CAFFE_LOSS_INL_H_ diff --git a/plugin/caffe/caffe_loss.cc b/plugin/caffe/caffe_loss.cc new file mode 100644 index 000000000000..a51f12602991 --- /dev/null +++ b/plugin/caffe/caffe_loss.cc @@ -0,0 +1,51 @@ +/*! 
+ * Copyright (c) 2016 by Contributors + * \file caffe_loss.cc + * \brief caffe loss + * \author Haoran Wang +*/ +#include "./caffe_loss-inl.h" + +namespace mxnet { +namespace op { +template<> +Operator *CreateOp(CaffeLossParam param, int dtype) { + Operator *op = NULL; + switch (dtype) { + case mshadow::kFloat32: + op = new CaffeLoss(param); + break; + case mshadow::kFloat64: + op = new CaffeLoss(param); + break; + case mshadow::kFloat16: + LOG(FATAL) << "float16 layer is not supported by caffe"; + break; + default: + LOG(FATAL) << "Unsupported type " << dtype; + } + return op; +} + +// DO_BIND_DISPATCH comes from static_operator_common.h +Operator *CaffeLossProp::CreateOperatorEx(Context ctx, std::vector *in_shape, + std::vector *in_type) const { + std::vector out_type, aux_type; + std::vector out_shape, aux_shape; + out_type.resize(this->ListOutputs().size()); + out_shape.resize(this->ListOutputs().size()); + aux_type.resize(this->ListAuxiliaryStates().size()); + aux_shape.resize(this->ListAuxiliaryStates().size()); + CHECK(InferType(in_type, &out_type, &aux_type)); + CHECK(InferShape(in_shape, &out_shape, &aux_shape)); + DO_BIND_DISPATCH(CreateOp, param_, (*in_type)[0]); +} + +DMLC_REGISTER_PARAMETER(CaffeLossParam); + +MXNET_REGISTER_OP_PROPERTY(CaffeLoss, CaffeLossProp) +.describe("Caffe loss layer") +.add_arguments(CaffeLossParam::__FIELDS__()); + +} // namespace op +} // namespace mxnet diff --git a/plugin/caffe/caffe_loss.cu b/plugin/caffe/caffe_loss.cu new file mode 100644 index 000000000000..55489cafc316 --- /dev/null +++ b/plugin/caffe/caffe_loss.cu @@ -0,0 +1,31 @@ +/*! 
+ * Copyright (c) 2016 by Contributors + * \file caffe_loss_gpu.cc + * \brief caffe loss + * \author Haoran Wang +*/ +#include "./caffe_loss-inl.h" + +namespace mxnet { +namespace op { +template<> +Operator* CreateOp(CaffeLossParam param, int dtype) { + Operator *op = NULL; + switch (dtype) { + case mshadow::kFloat32: + op = new CaffeLoss(param); + break; + case mshadow::kFloat64: + op = new CaffeLoss(param); + break; + case mshadow::kFloat16: + LOG(FATAL) << "float16 layer is not supported by caffe"; + break; + default: + LOG(FATAL) << "Unsupported type " << dtype; + } + return op; +} + +} // namespace op +} // namespace mxnet diff --git a/plugin/caffe/caffe_op-inl.h b/plugin/caffe/caffe_op-inl.h new file mode 100644 index 000000000000..a30f224160ed --- /dev/null +++ b/plugin/caffe/caffe_op-inl.h @@ -0,0 +1,329 @@ +/*! + * Copyright (c) 2016 by Contributors + * \file caffe_op-inl.h + * \brief Caffe Operator + * \author Haoran Wang +*/ +#ifndef PLUGIN_CAFFE_CAFFE_OP_INL_H_ +#define PLUGIN_CAFFE_CAFFE_OP_INL_H_ + +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "../../src/operator/operator_common.h" +#include "caffe_common.h" +#include "caffe_stream.h" +#include "caffe_fieldentry.h" +#include "caffe_blob.h" + +namespace mxnet { +namespace op { + +struct CaffeOpParam : public dmlc::Parameter { + ::caffe::LayerParameter prototxt; + int num_data, num_weight, num_out; + + DMLC_DECLARE_PARAMETER(CaffeOpParam) { DMLC_DECLARE_FIELD(prototxt).set_default("layer{}") + .describe("Caffe's layer parameter"); + DMLC_DECLARE_FIELD(num_data).set_default(1) + .describe("Operator input number"); + DMLC_DECLARE_FIELD(num_weight).set_default(0) + .describe("Weight number"); + DMLC_DECLARE_FIELD(num_out).set_default(1) + .describe("Operator output number"); + } +}; + + +/** + * \brief this is the implementation of caffe operator in caffe. + * \tparam xpu the device that the op will be executed on. 
+ */ +template +class CaffeOp : public Operator { + public: + explicit CaffeOp(CaffeOpParam p):param_(p), + init_w_(false), + init_wd_(false), + setup_(false) { + std::string type = param_.prototxt.type(); + caffeOp_ = caffe::LayerRegistry::CreateLayer(param_.prototxt); + + caffe::InitCaffeBlobs(&bot_, param_.num_data); + caffe::InitCaffeBlobs(&top_, param_.num_out); + caffe::InitCaffeBlobs(&wei_, param_.num_weight); + flags_.resize(param_.num_data); + } + + ~CaffeOp() { + caffe::DelCaffeBlobs(&bot_, param_.num_data); + caffe::DelCaffeBlobs(&top_, param_.num_out); + caffe::DelCaffeBlobs(&wei_, param_.num_weight); + } + + virtual void Forward(const OpContext &ctx, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data, + const std::vector &aux_args) { + // Set mode before forward + caffe::CaffeMode::SetMode(); + using ::caffe::Blob; + using std::vector; + using namespace mshadow; + using namespace mshadow::expr; + for (uint32_t i = 0; i < req.size(); ++i) + CHECK_EQ(req[i], kWriteTo); + int expected_num_data = param_.num_weight + param_.num_data; + CHECK_EQ(in_data.size(), expected_num_data); + CHECK_EQ(out_data.size(), param_.num_out); + +#if defined(__CUDACC__) + Stream *s = ctx.get_stream(); + // TODO(Haoran): when need cublas handle in stream? 
+ CHECK_EQ(s->blas_handle_ownership_, Stream::OwnHandle) + << "Must init CuBLAS handle in stream"; +#endif // __CUDACC__ + + caffe::TBlob2CaffeBlob(caffe::Data, + bot_.begin(), + in_data.begin(), + param_.num_data); + caffe::TBlob2CaffeBlob(caffe::Data, + top_.begin(), + out_data.begin(), + param_.num_out); + CaffeOpSetup(); + // Init caffe's weight pointer + if (!init_w_) { + init_w_ = true; + caffe::TBlob2CaffeBlob(caffe::Data, + wei_.begin(), + in_data.begin() + param_.num_data, + param_.num_weight); + caffe::SetOpBlobs(caffeOp_, wei_); + } + if (ctx.is_train) + caffeOp_->SetPhase(::caffe::TRAIN); + else + caffeOp_->SetPhase(::caffe::TEST); + caffeOp_->Forward(bot_, top_); + +#if defined(__CUDACC__) + // Sync cpu data to gpu data + for (uint32_t i = 0; i < top_.size(); ++i) + top_[i]->gpu_data(); + + CHECK_EQ(cudaStreamSynchronize(NULL), cudaSuccess); +#endif // __CUDACC__ + } + + // Set up caffe op with real data + void CaffeOpSetup() { + if (!setup_) { + setup_ = true; + caffeOp_->SetUp(bot_, top_); + } + } + + virtual void Backward(const OpContext &ctx, + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &req, + const std::vector &in_grad, + const std::vector &aux_args) { + // Set mode before backward + caffe::CaffeMode::SetMode(); + using namespace mshadow; + using namespace mshadow::expr; + CHECK_EQ(out_grad.size(), param_.num_out); + for (int i = 0; i < param_.num_data; ++i) + CHECK(req[i] != kAddTo) << "caffe doesn't accm diff on bottom data"; + + int expected_num_data = param_.num_weight + param_.num_data; + CHECK(in_data.size() == expected_num_data && in_grad.size() == expected_num_data); + CHECK_EQ(req.size(), expected_num_data); + + Stream *s = ctx.get_stream(); +#if defined(__CUDACC__) + // TODO(Haoran): when need cublas handle in stream? 
+ CHECK_EQ(s->blas_handle_ownership_, Stream::OwnHandle)
+ << "Must init CuBLAS handle in stream";
+#endif // __CUDACC__
+
+ caffe::TBlob2CaffeBlob(caffe::Grad,
+ bot_.begin(),
+ in_grad.begin(),
+ param_.num_data);
+ caffe::TBlob2CaffeBlob(caffe::Grad,
+ top_.begin(),
+ out_grad.begin(),
+ param_.num_out);
+
+ // Init caffe's gradient pointer
+ if (!init_wd_) {
+ init_wd_ = true;
+ caffe::TBlob2CaffeBlob(caffe::Grad,
+ wei_.begin(),
+ in_grad.begin() + param_.num_data,
+ param_.num_weight);
+ }
+
+ // Handle OpReqType of weights
+ for (int i = param_.num_data; i < expected_num_data; ++i)
+ HandleOpReq(s, req[i], in_grad[i]);
+
+ // Set BP flag
+ for (int i = 0; i < param_.num_data; ++i)
+ flags_[i] = req[i] != kNullOp;
+
+ caffeOp_->Backward(top_, flags_, bot_);
+
+#if defined(__CUDACC__)
+ // Sync cpu diff to gpu diff
+ for (uint32_t i = 0; i < bot_.size(); ++i)
+ bot_[i]->gpu_diff();
+
+ CHECK_EQ(cudaStreamSynchronize(NULL), cudaSuccess);
+#endif // __CUDACC__
+ }
+
+ void HandleOpReq(mshadow::Stream*s, OpReqType req, const TBlob& in_g) {
+ if ((req == kWriteInplace) || (req == kWriteTo)) {
+ mshadow::Tensor grad = in_g.FlatTo2D(s);
+ grad = 0;
+ }
+ }
+
+ private:
+ CaffeOpParam param_;
+ ::caffe::Layer *caffeOp_;
+ std::vector< ::caffe::Blob *> bot_, top_, wei_;
+ std::vector flags_;
+ bool init_w_, init_wd_, setup_;
+}; // class CaffeOp
+
+// Declare Factory function, used for dispatch specialization
+template
+Operator* CreateOp(CaffeOpParam param, int);
+
+#if DMLC_USE_CXX11
+class CaffeOpProp : public OperatorProperty {
+ public:
+ std::vector ListArguments() const override {
+ std::vector res;
+ for (int i = 0; i < param_.num_data; ++i)
+ res.push_back(std::string("data_") + static_cast('0' + i));
+
+ for (int i = 0; i < param_.num_weight; ++i) {
+ if (i == 0)
+ res.push_back(std::to_string(i) + "_weight");
+ else
+ res.push_back(std::to_string(i) + "_bias");
+ }
+ return res;
+ }
+
+ std::vector ListOutputs() const override {
+ if (param_.num_out > 1) { 
+ std::vector ret;
+ for (int i = 0; i < param_.num_out; ++i)
+ ret.push_back("output" + std::to_string(i));
+ return ret;
+ } else {
+ return {"output"};
+ }
+ }
+
+ void Init(const std::vector >& kwargs) override {
+ param_.Init(kwargs);
+ }
+
+ std::map GetParams() const override {
+ return param_.__DICT__();
+ }
+
+ /*
+ * \brief Set up caffeOp_ to infer weights & output shape
+ * \brief Initialize param_'s in & out dims
+ */
+ bool InferShape(std::vector *in_shape,
+ std::vector *out_shape,
+ std::vector *aux_shape) const override {
+ if (caffeOp_ == NULL)
+ caffeOp_ = caffe::LayerRegistry::CreateLayer(param_.prototxt);
+ using namespace mshadow;
+ using ::caffe::Blob;
+ using std::vector;
+ CHECK_GE(in_shape->size(), param_.num_data);
+ // Initialize empty bottom & top blobs for caffeOp
+ vector *> bot_blobs, top_blobs;
+
+ for (int i = 0; i < param_.num_data; ++i) {
+ TShape tshape = (*in_shape)[i];
+ if (tshape.ndim() == 0) return false;
+ auto blob_ptr = new Blob();
+ blob_ptr->Reshape(caffe::TShape2Vector(tshape));
+ bot_blobs.push_back(blob_ptr);
+ }
+
+ for (int i = 0; i < param_.num_out; ++i)
+ top_blobs.push_back(new Blob());
+
+ caffeOp_->SetUp(bot_blobs, top_blobs);
+ CHECK_EQ(in_shape->size(), caffeOp_->blobs().size() + param_.num_data);
+ // Set weight shape
+ CHECK_EQ(param_.num_weight, caffeOp_->blobs().size());
+ for (int i = 0; i < param_.num_weight ; ++i) {
+ TShape tshape = caffe::Vector2TShape(caffeOp_->blobs()[i]->shape());
+ SHAPE_ASSIGN_CHECK(*in_shape, i + param_.num_data, tshape);
+ }
+ // Initialize out shapes
+ out_shape->clear();
+ for (auto blob : top_blobs) {
+ TShape tshape = caffe::Vector2TShape(blob->shape());
+ out_shape->push_back(tshape);
+ }
+
+ for (auto blob_ptr : bot_blobs)
+ delete blob_ptr;
+ for (auto blob_ptr : top_blobs)
+ delete blob_ptr;
+ return true;
+ }
+
+ OperatorProperty* Copy() const override {
+ auto copy_prop = new CaffeOpProp();
+ copy_prop->param_ = this->param_;
+ return copy_prop;
+ }
+
+ std::string 
TypeString() const override { + return "CaffeOp"; + } + + Operator* CreateOperator(Context ctx) const override { + LOG(FATAL) << "Not Implemented."; + return NULL; + } + + Operator* CreateOperatorEx(Context ctx, std::vector *in_shape, + std::vector *in_type) const override; + + private: + mutable CaffeOpParam param_; + mutable ::caffe::Layer *caffeOp_; +}; // class CaffeOpSymbol +#endif + +} // namespace op +} // namespace mxnet +#endif // PLUGIN_CAFFE_CAFFE_OP_INL_H_ diff --git a/plugin/caffe/caffe_op.cc b/plugin/caffe/caffe_op.cc new file mode 100644 index 000000000000..90cb4da44b0f --- /dev/null +++ b/plugin/caffe/caffe_op.cc @@ -0,0 +1,52 @@ +/*! + * Copyright (c) 2016 by Contributors + * \file caffe_op.cc + * \brief caffe operator + * \author Haoran Wang +*/ +#include "./caffe_op-inl.h" +namespace mxnet { +namespace op { + +template<> +Operator* CreateOp(CaffeOpParam param, int dtype) { + Operator *op = NULL; + switch (dtype) { + case mshadow::kFloat32: + op = new CaffeOp(param); + break; + case mshadow::kFloat64: + op = new CaffeOp(param); + break; + case mshadow::kFloat16: + LOG(FATAL) << "float16 layer is not supported by caffe"; + break; + default: + LOG(FATAL) << "Unsupported type " << dtype; + } + return op; +} + +// DO_BIND_DISPATCH comes from static_operator_common.h +Operator *CaffeOpProp::CreateOperatorEx(Context ctx, std::vector *in_shape, + std::vector *in_type) const { + std::vector out_type, aux_type; + std::vector out_shape, aux_shape; + out_type.resize(this->ListOutputs().size()); + out_shape.resize(this->ListOutputs().size()); + aux_type.resize(this->ListAuxiliaryStates().size()); + aux_shape.resize(this->ListAuxiliaryStates().size()); + CHECK(InferType(in_type, &out_type, &aux_type)); + CHECK(InferShape(in_shape, &out_shape, &aux_shape)); + DO_BIND_DISPATCH(CreateOp, param_, (*in_type)[0]); +} + +DMLC_REGISTER_PARAMETER(CaffeOpParam); + +MXNET_REGISTER_OP_PROPERTY(CaffeOp, CaffeOpProp) +.describe("Apply caffe operator") +.add_argument("data", 
"Symbol[]", "List of tensors") +.add_arguments(CaffeOpParam::__FIELDS__()); + +} // namespace op +} // namespace mxnet diff --git a/plugin/caffe/caffe_op.cu b/plugin/caffe/caffe_op.cu new file mode 100644 index 000000000000..c52f2b69fb63 --- /dev/null +++ b/plugin/caffe/caffe_op.cu @@ -0,0 +1,31 @@ +/*! + * Copyright (c) 2016 by Contributors + * \file caffe_operator_gpu.cc + * \brief caffe operator + * \author Haoran Wang +*/ +#include "./caffe_op-inl.h" +namespace mxnet { +namespace op { + +template<> +Operator *CreateOp(CaffeOpParam param, int dtype) { + Operator *op = NULL; + switch (dtype) { + case mshadow::kFloat32: + op = new CaffeOp(param); + break; + case mshadow::kFloat64: + op = new CaffeOp(param); + break; + case mshadow::kFloat16: + LOG(FATAL) << "float16 layer is not supported by caffe"; + break; + default: + LOG(FATAL) << "Unsupported type " << dtype; + } + return op; +} + +} // namespace op +} // namespace mxnet diff --git a/plugin/caffe/caffe_stream.cc b/plugin/caffe/caffe_stream.cc new file mode 100644 index 000000000000..99202bf9c09c --- /dev/null +++ b/plugin/caffe/caffe_stream.cc @@ -0,0 +1,18 @@ +/*! + * Copyright (c) 2016 by Contributors + * \file caffe_stream.cc + * \brief define stream opertors >> and << + * \author Haoran Wang +*/ +#include"caffe_stream.h" + +namespace dmlc { +namespace parameter { + std::istringstream &operator>>(std::istringstream &is, ::caffe::LayerParameter ¶_) { + return is; + } + std::ostream &operator<<(std::ostream &os, ::caffe::LayerParameter ¶_) { + return os; + } +} +} diff --git a/plugin/caffe/caffe_stream.h b/plugin/caffe/caffe_stream.h new file mode 100644 index 000000000000..de9edb84feb4 --- /dev/null +++ b/plugin/caffe/caffe_stream.h @@ -0,0 +1,19 @@ +/*! 
+ * Copyright (c) 2016 by Contributors
+ * \file caffe_stream.h
+ * \brief define stream operators >> and <<
+ * \author Haoran Wang
+*/
+#ifndef PLUGIN_CAFFE_CAFFE_STREAM_H_
+#define PLUGIN_CAFFE_CAFFE_STREAM_H_
+
+#include
+#include
+namespace dmlc {
+namespace parameter {
+ std::istringstream &operator>>(std::istringstream &is, ::caffe::LayerParameter &para_);
+ std::ostream &operator<<(std::ostream &os, ::caffe::LayerParameter &para_);
+}
+}
+
+#endif // PLUGIN_CAFFE_CAFFE_STREAM_H_
diff --git a/plugin/warpctc/warpctc-inl.h b/plugin/warpctc/warpctc-inl.h
index b37132144cb9..a09ed8245666 100644
--- a/plugin/warpctc/warpctc-inl.h
+++ b/plugin/warpctc/warpctc-inl.h
@@ -26,6 +26,7 @@ namespace op {
 namespace warpctc_enum {
 enum CTCOpInputs {kData, kLabel};
 enum CTCOpOutputs {kOut};
+ enum CTCTemp {kTmp};
 } // namespace warpctc_enum
 struct WarpCTCParam : public dmlc::Parameter {
@@ -115,6 +116,7 @@ class WarpCTCOp : public Operator {
 const std::vector &in_grad,
 const std::vector &aux_args) {
 using namespace mshadow;
+ Stream *s = ctx.get_stream();
 TBlob data = in_data[warpctc_enum::kData];
 TBlob label = in_data[warpctc_enum::kLabel];
 CHECK_EQ(data.shape_.ndim(), 2) << "input data shape should be 2 (t*n, p)";
@@ -126,8 +128,8 @@ class WarpCTCOp : public Operator {
 #if MXNET_USE_CUDA
 info.loc = CTC_GPU;
 info.stream = ctx.get_stream()->stream_;
-#endif
 } else {
+#endif
 LOG(FATAL) << "Unknown device type " << data.dev_mask_;
 }
@@ -173,16 +175,10 @@ class WarpCTCOp : public Operator {
 input_lengths.size(),
 info,
 &alloc_bytes),
 "Error: get_workspace_size in inf_test");
- void* ctc_workspace;
- if (data.dev_mask_ == cpu::kDevMask) {
- ctc_workspace = malloc(alloc_bytes);
- } else if (data.dev_mask_ == gpu::kDevMask) {
-#if MXNET_USE_CUDA
- cuda_status = cudaMalloc(&ctc_workspace, alloc_bytes);
- CHECK_EQ(cuda_status, cudaSuccess) << "cuda malloc worksapce fail";
-#endif
- }
+ Tensor ctc_workspace = ctx.requested[warpctc_enum::kTmp].get_space(
+ mshadow::Shape1(alloc_bytes), s);
+ 
std::vector costs(minibatch); throw_on_error(compute_ctc_loss(activations, grads, @@ -192,17 +188,14 @@ class WarpCTCOp : public Operator { alphabet_size, minibatch, costs.data(), - ctc_workspace, + ctc_workspace.dptr_, info), "Error: compute_ctc_loss"); if (data.dev_mask_ == cpu::kDevMask) { - free(ctc_workspace); free(cpu_labels); } else if (data.dev_mask_ == gpu::kDevMask) { #if MXNET_USE_CUDA - cuda_status = cudaFree(ctc_workspace); - CHECK_EQ(cuda_status, cudaSuccess) << "cuda free workspace fail"; free(cpu_raw_labels); free(cpu_labels); #endif @@ -262,6 +255,11 @@ class WarpCTCProp : public OperatorProperty { return true; } + std::vector BackwardResource( + const std::vector &in_shape) const override { + return {ResourceRequest::kTempSpace}; + } + OperatorProperty* Copy() const override { auto ptr = new WarpCTCProp(); ptr->param_ = param_; diff --git a/ps-lite b/ps-lite index 35ddccd4cd03..36b015ffd51c 160000 --- a/ps-lite +++ b/ps-lite @@ -1 +1 @@ -Subproject commit 35ddccd4cd0302f78ed2a05f1258860d4666e43c +Subproject commit 36b015ffd51c0f7062bba845f01164c0433dc6b3 diff --git a/python/mxnet/__init__.py b/python/mxnet/__init__.py index 6dadf8a95ba3..a2c9f7e02cea 100644 --- a/python/mxnet/__init__.py +++ b/python/mxnet/__init__.py @@ -47,4 +47,6 @@ from . import module from . import module as mod +from . 
import test_utils + __version__ = base.__version__ diff --git a/python/mxnet/attribute.py b/python/mxnet/attribute.py index b60ea0150492..df517699d11c 100644 --- a/python/mxnet/attribute.py +++ b/python/mxnet/attribute.py @@ -59,4 +59,3 @@ def __exit__(self, ptype, value, trace): AttrScope.current = self._old_scope AttrScope.current = AttrScope() - diff --git a/python/mxnet/base.py b/python/mxnet/base.py index abe614eaf61e..0a276ea65fc3 100644 --- a/python/mxnet/base.py +++ b/python/mxnet/base.py @@ -1,12 +1,12 @@ # coding: utf-8 -# pylint: disable=invalid-name +# pylint: disable=invalid-name, no-member """ ctypes library of mxnet and helper functions """ from __future__ import absolute_import import sys import ctypes -import numpy as np import atexit +import numpy as np from . import libinfo __all__ = ['MXNetError'] @@ -76,19 +76,34 @@ def check_call(ret): if ret != 0: raise MXNetError(py_str(_LIB.MXGetLastError())) -def c_str(string): - """Create ctypes char * from a python string - Parameters - ---------- - string : string type - python string - - Returns - ------- - str : c_char_p - A char pointer that can be passed to C API - """ - return ctypes.c_char_p(string.encode('utf-8')) +if sys.version_info[0] < 3: + def c_str(string): + """Create ctypes char * from a python string + Parameters + ---------- + string : string type + python string + + Returns + ------- + str : c_char_p + A char pointer that can be passed to C API + """ + return ctypes.c_char_p(string) +else: + def c_str(string): + """Create ctypes char * from a python string + Parameters + ---------- + string : string type + python string + + Returns + ------- + str : c_char_p + A char pointer that can be passed to C API + """ + return ctypes.c_char_p(string.encode('utf-8')) def c_array(ctype, values): diff --git a/python/mxnet/executor.py b/python/mxnet/executor.py index 32c6ec1748a4..acb66eefa9a3 100644 --- a/python/mxnet/executor.py +++ b/python/mxnet/executor.py @@ -49,6 +49,7 @@ def __init__(self, 
handle, symbol, ctx, grad_req, group2ctx): self._arg_dict = None self._grad_dict = None self._aux_dict = None + self._output_dict = None self._monitor_callback = None self._ctx = copy.deepcopy(ctx) self._grad_req = copy.deepcopy(grad_req) @@ -124,7 +125,7 @@ def backward(self, out_grads=None): Parameters ---------- - out_grads : NDArray or list of NDArray, optional + out_grads : NDArray or list of NDArray or dict of str to NDArray, optional Gradient on the outputs to be propagated back. This parameter is only needed when bind is called on outputs that are not a loss function. @@ -133,6 +134,8 @@ def backward(self, out_grads=None): out_grads = [] elif isinstance(out_grads, NDArray): out_grads = [out_grads] + elif isinstance(out_grads, dict): + out_grads = [out_grads[k] for k in self._symbol.list_outputs()] for obj in out_grads: if not isinstance(obj, NDArray): @@ -208,6 +211,24 @@ def aux_dict(self): self._symbol.list_auxiliary_states(), self.aux_arrays) return self._aux_dict + @property + def output_dict(self): + """Get dictionary representation of output arrays. + + Returns + ------- + output_dict : dict of str to NDArray + The dictionary that maps name of output names to NDArrays. + + Raises + ------ + ValueError : if there are duplicated names in the outputs. + """ + if self._output_dict is None: + self._output_dict = Executor._get_dict( + self._symbol.list_outputs(), self.outputs) + return self._output_dict + def copy_params_from(self, arg_params, aux_params=None, allow_extra_params=False): """Copy parameters from arg_params, aux_params into executor's internal array. 
@@ -268,7 +289,7 @@ def reshape(self, partial_shaping=False, allow_up_sizing=False, **kwargs): """ # pylint: disable=too-many-branches arg_shapes, _, aux_shapes = self._symbol.infer_shape(**kwargs) - if arg_shapes == None: + if arg_shapes is None: raise ValueError("Insufficient argument shapes provided.") new_arg_dict = {} @@ -336,4 +357,3 @@ def debug_str(self): check_call(_LIB.MXExecutorPrint( self.handle, ctypes.byref(debug_str))) return py_str(debug_str.value) - diff --git a/python/mxnet/executor_manager.py b/python/mxnet/executor_manager.py index cc41691d342b..186fa2950742 100644 --- a/python/mxnet/executor_manager.py +++ b/python/mxnet/executor_manager.py @@ -3,12 +3,13 @@ """Executor manager""" from __future__ import absolute_import +import logging +import numpy as np + from .base import mx_real_t from . import ndarray as nd from .context import cpu -import logging -import numpy as np def _split_input_slice(batch_size, work_load_list): """Get input slice from the input shape. @@ -111,11 +112,11 @@ def _bind_exec(sym, ctx, input_shapes, param_names, need_grad=False, arg_names = sym.list_arguments() - if need_grad == False: + if need_grad is False: need_grad = set() - elif need_grad == True: + elif need_grad is True: need_grad = set(arg_names) - set(input_shapes.keys()) - elif need_grad is set: + elif isinstance(need_grad, set): pass else: raise AssertionError("need_grad must be boolean or set.") @@ -123,8 +124,7 @@ def _bind_exec(sym, ctx, input_shapes, param_names, need_grad=False, # create or borrow arguments and gradients - for i in range(len(arg_names)): - name = arg_names[i] + for i, name in enumerate(arg_names): if not name in param_names: # data or label if shared_data_arrays is not None and \ @@ -226,7 +226,7 @@ def __init__(self, sym, arg_names, param_names, self.param_names = [arg_names[i] for i in self.param_idx] self.train_execs = [] - for i in range(len(ctx)): + for i, ctxi in enumerate(ctx): data_shapes = {} batch_size = 0 for k, v in 
train_data.provide_data: @@ -254,7 +254,7 @@ def __init__(self, sym, arg_names, param_names, / batch_size)] + list(v[1:])) shared_exec = None if shared_group is None else shared_group.train_execs[i] - train_exec = _bind_exec(sym, ctx[i], data_shapes, self.param_names, + train_exec = _bind_exec(sym, ctxi, data_shapes, self.param_names, need_grad=True, base_exec=shared_exec, shared_data_arrays=self.shared_data_arrays[i]) self.train_execs.append(train_exec) diff --git a/python/mxnet/initializer.py b/python/mxnet/initializer.py index 47aa0bd3a7b9..e97e9f0e0eb2 100644 --- a/python/mxnet/initializer.py +++ b/python/mxnet/initializer.py @@ -3,12 +3,12 @@ """Initialization helper for mxnet""" from __future__ import absolute_import +import re +import logging import numpy as np from .base import string_types from .ndarray import NDArray, load from . import random -import logging -import re class Initializer(object): """Base class for Initializer.""" diff --git a/python/mxnet/io.py b/python/mxnet/io.py index 532c6d12ebf2..230125d61371 100644 --- a/python/mxnet/io.py +++ b/python/mxnet/io.py @@ -1,15 +1,15 @@ # coding: utf-8 -# pylint: disable=invalid-name, protected-access, fixme, too-many-arguments, W0221, W0201, no-self-use +# pylint: disable=invalid-name, protected-access, fixme, too-many-arguments, W0221, W0201, no-self-use, no-member """NDArray interface of mxnet""" from __future__ import absolute_import from collections import OrderedDict -import ctypes import sys -import numpy as np +import ctypes import logging import threading +import numpy as np from .base import _LIB from .base import c_array, c_str, mx_uint, py_str from .base import DataIterHandle, NDArrayHandle @@ -197,16 +197,8 @@ def __init__(self, iters, rename_data=None, rename_label=None): self.n_iter = len(iters) assert self.n_iter > 0 self.iters = iters - if rename_data is None: - self.provide_data = sum([i.provide_data for i in iters], []) - else: - self.provide_data = sum([[(r[n], s) for n, s in 
i.provide_data] \ - for r, i in zip(rename_data, iters)], []) - if rename_label is None: - self.provide_label = sum([i.provide_label for i in iters], []) - else: - self.provide_label = sum([[(r[n], s) for n, s in i.provide_label] \ - for r, i in zip(rename_label, iters)], []) + self.rename_data = rename_data + self.rename_label = rename_label self.batch_size = self.provide_data[0][1][0] self.data_ready = [threading.Event() for i in range(self.n_iter)] self.data_taken = [threading.Event() for i in range(self.n_iter)] @@ -240,6 +232,24 @@ def __del__(self): for thread in self.prefetch_threads: thread.join() + @property + def provide_data(self): + """The name and shape of data provided by this iterator""" + if self.rename_data is None: + return sum([i.provide_data for i in self.iters], []) + else: + return sum([[(r[n], s) for n, s in i.provide_data] \ + for r, i in zip(self.rename_data, self.iters)], []) + + @property + def provide_label(self): + """The name and shape of label provided by this iterator""" + if self.rename_label is None: + return sum([i.provide_label for i in self.iters], []) + else: + return sum([[(r[n], s) for n, s in i.provide_label] \ + for r, i in zip(self.rename_label, self.iters)], []) + def reset(self): for e in self.data_ready: e.wait() @@ -264,7 +274,9 @@ def iter_next(self): self.current_batch = DataBatch(sum([batch.data for batch in self.next_batch], []), sum([batch.label for batch in self.next_batch], []), self.next_batch[0].pad, - self.next_batch[0].index) + self.next_batch[0].index, + provide_data=self.provide_data, + provide_label=self.provide_label) for e in self.data_ready: e.clear() for e in self.data_taken: @@ -352,12 +364,9 @@ def __init__(self, data, label=None, batch_size=1, shuffle=False, last_batch_han self.data = [(k, array(v.asnumpy()[idx], v.context)) for k, v in self.data] self.label = [(k, array(v.asnumpy()[idx], v.context)) for k, v in self.label] - self.data_list = [x[1] for x in self.data] + [x[1] for x in self.label] - 
self.num_source = len(self.data_list) - # batching if last_batch_handle == 'discard': - new_n = self.data_list[0].shape[0] - self.data_list[0].shape[0] % batch_size + new_n = self.data[0][1].shape[0] - self.data[0][1].shape[0] % batch_size data_dict = OrderedDict(self.data) label_dict = OrderedDict(self.label) for k, _ in self.data: @@ -366,6 +375,9 @@ def __init__(self, data, label=None, batch_size=1, shuffle=False, last_batch_han label_dict[k] = label_dict[k][:new_n] self.data = data_dict.items() self.label = label_dict.items() + + self.data_list = [x[1] for x in self.data] + [x[1] for x in self.label] + self.num_source = len(self.data_list) self.num_data = self.data_list[0].shape[0] assert self.num_data >= batch_size, \ "batch_size need to be smaller than data size." @@ -396,10 +408,7 @@ def reset(self): def iter_next(self): self.cursor += self.batch_size - if self.cursor < self.num_data: - return True - else: - return False + return self.cursor < self.num_data def next(self): if self.iter_next(): diff --git a/python/mxnet/kvstore.py b/python/mxnet/kvstore.py index 9fe45c7379b5..c23ae4309eb8 100644 --- a/python/mxnet/kvstore.py +++ b/python/mxnet/kvstore.py @@ -29,8 +29,8 @@ def _ctype_key_value(keys, vals): assert(isinstance(k, int)) c_keys = [] c_vals = [] - for i in range(len(keys)): - c_key_i, c_val_i = _ctype_key_value(keys[i], vals[i]) + for key, val in zip(keys, vals): + c_key_i, c_val_i = _ctype_key_value(key, val) c_keys += c_key_i c_vals += c_val_i return (c_array(ctypes.c_int, c_keys), c_array(NDArrayHandle, c_vals)) diff --git a/python/mxnet/kvstore_server.py b/python/mxnet/kvstore_server.py index 42c1197c7136..b006a9e5c7fc 100644 --- a/python/mxnet/kvstore_server.py +++ b/python/mxnet/kvstore_server.py @@ -25,7 +25,7 @@ def _controller(self): """return the server controller""" def server_controller(cmd_id, cmd_body, _): """server controler""" - if self.init_logginig == False: + if not self.init_logginig: # the reason put the codes here is because we 
cannot get # kvstore.rank earlier head = '%(asctime)-15s Server[' + str( diff --git a/python/mxnet/metric.py b/python/mxnet/metric.py index bc51e469db4a..674f9255628b 100644 --- a/python/mxnet/metric.py +++ b/python/mxnet/metric.py @@ -3,8 +3,9 @@ """Online evaluation metric module.""" from __future__ import absolute_import -from . import ndarray + import numpy +from . import ndarray def check_label_shapes(labels, preds, shape=0): """Check to see if the two arrays are the same size.""" @@ -41,7 +42,7 @@ def update(self, label, pred): def reset(self): """Clear the internal statistics to initial state.""" - if self.num == None: + if self.num is None: self.num_inst = 0 self.sum_metric = 0.0 else: @@ -58,7 +59,7 @@ def get(self): value : float Value of the evaluation. """ - if self.num == None: + if self.num is None: if self.num_inst == 0: return (self.name, float('nan')) else: @@ -133,9 +134,9 @@ def __init__(self): def update(self, labels, preds): check_label_shapes(labels, preds) - for i in range(len(labels)): - pred_label = ndarray.argmax_channel(preds[i]).asnumpy().astype('int32') - label = labels[i].asnumpy().astype('int32') + for label, pred_label in zip(labels, preds): + pred_label = ndarray.argmax_channel(pred_label).asnumpy().astype('int32') + label = label.asnumpy().astype('int32') check_label_shapes(label, pred_label) @@ -157,10 +158,10 @@ def __init__(self, **kwargs): def update(self, labels, preds): check_label_shapes(labels, preds) - for i in range(len(labels)): - assert(len(preds[i].shape) <= 2), 'Predictions should be no more than 2 dims' - pred_label = numpy.argsort(preds[i].asnumpy().astype('float32'), axis=1) - label = labels[i].asnumpy().astype('int32') + for label, pred_label in zip(labels, preds): + assert(len(pred_label.shape) <= 2), 'Predictions should be no more than 2 dims' + pred_label = numpy.argsort(pred_label.asnumpy().astype('float32'), axis=1) + label = label.asnumpy().astype('int32') check_label_shapes(label, pred_label) num_samples = 
pred_label.shape[0] num_dims = len(pred_label.shape) @@ -182,9 +183,9 @@ def __init__(self): def update(self, labels, preds): check_label_shapes(labels, preds) - for i in range(len(labels)): - pred = preds[i].asnumpy() - label = labels[i].asnumpy().astype('int32') + for label, pred in zip(labels, preds): + pred = pred.asnumpy() + label = label.asnumpy().astype('int32') pred_label = numpy.argmax(pred, axis=1) check_label_shapes(label, pred) @@ -299,14 +300,19 @@ def update(self, labels, preds): class Torch(EvalMetric): """Dummy metric for torch criterions""" - def __init__(self): - super(Torch, self).__init__('torch') + def __init__(self, name='torch'): + super(Torch, self).__init__(name) def update(self, _, preds): for pred in preds: self.sum_metric += pred.asnumpy().mean() self.num_inst += 1 +class Caffe(Torch): + """Dummy metric for caffe criterions""" + def __init__(self): + super(Caffe, self).__init__('caffe') + class CustomMetric(EvalMetric): """Custom evaluation metric that takes a NDArray function. diff --git a/python/mxnet/misc.py b/python/mxnet/misc.py index 2d3ffc6e5abd..51bcf9d128fb 100644 --- a/python/mxnet/misc.py +++ b/python/mxnet/misc.py @@ -52,7 +52,7 @@ def __call__(self, iteration): Current iteration count """ - if self.init == False: + if not self.init: self.init = True self.old_lr = self.base_lr lr = self.base_lr * math.pow(self.factor, int(iteration / self.step)) @@ -61,5 +61,3 @@ def __call__(self, iteration): logging.info("At Iteration [%d]: Swith to new learning rate %.5f", iteration, lr) return lr - - diff --git a/python/mxnet/model.py b/python/mxnet/model.py index 614f01813505..396138c72ffc 100644 --- a/python/mxnet/model.py +++ b/python/mxnet/model.py @@ -3,9 +3,11 @@ """MXNet model module""" from __future__ import absolute_import -import numpy as np import time import logging +from collections import namedtuple +import numpy as np + from . import io from . import nd from . import symbol as sym @@ -14,7 +16,6 @@ from . 
import kvstore as kvs from .context import Context, cpu from .initializer import Uniform -from collections import namedtuple from .optimizer import get_updater from .executor_manager import DataParallelExecutorManager, _check_arguments, _load_data, _load_label import pdb @@ -47,7 +48,7 @@ def _create_kvstore(kvstore, num_device, arg_params): arg_params : dict of str to NDArray Model parameter, dict of name to NDArray of net's weights. """ - + update_on_kvstore = True if kvstore is None: kv = None elif isinstance(kvstore, kvs.KVStore): @@ -58,21 +59,17 @@ def _create_kvstore(kvstore, num_device, arg_params): # no need to use kv for single device and single machine kv = None else: - if kvstore is 'local': - # automatically select a proper local - max_size = max(np.prod(param.shape) for param in arg_params.values()) - if max_size < 1024 * 1024 * 16: - kvstore = 'local_update_cpu' - else: - kvstore = 'local_allreduce_cpu' - logging.info('Auto-select kvstore type = %s', kvstore) kv = kvs.create(kvstore) + if kvstore is 'local': + # automatically select a proper local + max_size = max(np.prod(param.shape) for param in + arg_params.values()) + if max_size > 1024 * 1024 * 16: + update_on_kvstore = False else: raise TypeError('kvstore must be KVStore, str or None') - # detect whether or not update weight on kvstore - update_on_kvstore = True - if not kv or 'local_allreduce' in kv.type: + if kv is None: update_on_kvstore = False return (kv, update_on_kvstore) @@ -80,8 +77,7 @@ def _create_kvstore(kvstore, num_device, arg_params): def _initialize_kvstore(kvstore, param_arrays, arg_params, param_names, update_on_kvstore): """ Initialize kvstore""" - for idx in range(len(param_arrays)): - param_on_devs = param_arrays[idx] + for idx, param_on_devs in enumerate(param_arrays): kvstore.init(idx, arg_params[param_names[idx]]) if update_on_kvstore: @@ -274,7 +270,7 @@ def _train_multi_device(symbol, ctx, arg_names, param_names, aux_names, do_reset = False break - if do_reset == True: 
+ if do_reset: logger.info('Epoch[%d] Resetting Data Iterator', epoch) train_data.reset() @@ -341,7 +337,9 @@ def save_checkpoint(prefix, epoch, symbol, arg_params, aux_params): - ``prefix-symbol.json`` will be saved for symbol. - ``prefix-epoch.params`` will be saved for parameters. """ - symbol.save('%s-symbol.json' % prefix) + if symbol is not None: + symbol.save('%s-symbol.json' % prefix) + save_dict = {('arg:%s' % k) : v for k, v in arg_params.items()} save_dict.update({('aux:%s' % k) : v for k, v in aux_params.items()}) param_name = '%s-%04d.params' % (prefix, epoch) @@ -805,7 +803,7 @@ def fit(self, X, y=None, eval_data=None, eval_metric='acc', # init optmizer if isinstance(self.optimizer, str): batch_size = data.batch_size - if kvstore and kvstore.type == 'dist_sync': + if kvstore and 'dist' in kvstore.type and not '_async' in kvstore.type: batch_size *= kvstore.num_workers optimizer = opt.create(self.optimizer, rescale_grad=(1.0/batch_size), diff --git a/python/mxnet/module/executor_group.py b/python/mxnet/module/executor_group.py index 62e815e487ec..2ddc7c4ab38f 100644 --- a/python/mxnet/module/executor_group.py +++ b/python/mxnet/module/executor_group.py @@ -2,8 +2,8 @@ # pylint: disable=too-many-branches,too-many-statements,too-many-arguments """Executor group is a convenient tool for managing a group of executors.""" -import numpy as np import logging +import numpy as np from .. import context as ctx from .. import ndarray as nd diff --git a/python/mxnet/module/python_module.py b/python/mxnet/module/python_module.py index ab9b952010a7..141849f19a59 100644 --- a/python/mxnet/module/python_module.py +++ b/python/mxnet/module/python_module.py @@ -280,7 +280,7 @@ def get_outputs(self, merge_multi_context=True): merge_multi_context : bool Should always be `True`, because we do not use multiple contexts for computing. 
""" - assert merge_multi_context == True + assert merge_multi_context is True return [self._scores] def backward(self, out_grads=None): @@ -324,7 +324,7 @@ def get_input_grads(self, merge_multi_context=True): merge_multi_context : bool Should always be `True` because we do not use multiple context for computation. """ - assert merge_multi_context == True + assert merge_multi_context is True return [self._scores_grad] def install_monitor(self, mon): diff --git a/python/mxnet/monitor.py b/python/mxnet/monitor.py index 4080aed61013..b9035eff7a78 100644 --- a/python/mxnet/monitor.py +++ b/python/mxnet/monitor.py @@ -1,13 +1,16 @@ # coding: utf-8 # pylint: disable=protected-access, logging-format-interpolation, invalid-name, no-member """Monitor outputs, weights, and gradients for debugging.""" +from __future__ import absolute_import + +import re import ctypes +import logging +from math import sqrt + from .ndarray import NDArray from .base import NDArrayHandle, py_str from . import ndarray -import logging -from math import sqrt -import re class Monitor(object): @@ -114,7 +117,3 @@ def toc_print(self): res = self.toc() for n, k, v in res: logging.info('Batch: {:7d} {:30s} {:s}'.format(n, k, v)) - - - - diff --git a/python/mxnet/ndarray.py b/python/mxnet/ndarray.py index 3d61f95b6b21..b36cad51210f 100644 --- a/python/mxnet/ndarray.py +++ b/python/mxnet/ndarray.py @@ -11,8 +11,8 @@ import operator import numpy as np from .base import _LIB, string_types, numeric_types -from .base import c_array, py_str, c_str, mx_real_t -from .base import mx_uint, mx_float, NDArrayHandle, FunctionHandle +from .base import c_array, mx_float, py_str, c_str, mx_real_t +from .base import mx_uint, NDArrayHandle, FunctionHandle from .base import ctypes2buffer from .base import check_call, ctypes2docstring from .context import Context @@ -276,6 +276,33 @@ def _slice(self, start, stop): self.handle, start, stop, ctypes.byref(handle))) return NDArray(handle=handle, writable=self.writable) + def 
_copy_slice_to(self, axis, start, stop, target): + """Copy a slice along an axis. + + Parameters + ---------- + axis : int + The axis along which to do slicing. + start : int + The starting index of the slice. + stop : int + The finishing index of the slice. + target : NDArray or Context + If an NDArray, must be pre-allocated with compatible shape. + If a Context, a new NDArray will be created. + + Returns + ------- + The sliced copy of the NDArray. + """ + if isinstance(target, Context): + shape = list(self.shape) + shape[axis] = stop - start + target = NDArray(_new_alloc_handle(shape, target, True, self.dtype)) + + assert isinstance(target, NDArray) + return _internal._copy_slice_to(self, axis, start, stop, out=target) + def _at(self, idx): """Return a sub NDArray that shares memory with current one. @@ -1122,8 +1149,8 @@ def binary_ndarray_function(lhs, rhs, out=None, **kwargs): c_array(mx_float, ()), \ c_array(NDArrayHandle, (out.handle,)), \ ctypes.c_int(len(kwargs)), \ - c_array(ctypes.c_char_p, [key.encode('ascii') for key in kwargs.keys()]), \ - c_array(ctypes.c_char_p, [str(i).encode('ascii') for i in kwargs.values()]))) + c_array(ctypes.c_char_p, [c_str(key) for key in kwargs.keys()]), \ + c_array(ctypes.c_char_p, [c_str(str(i)) for i in kwargs.values()]))) return out def unary_ndarray_function(src, out=None, *args, **kwargs): @@ -1143,8 +1170,8 @@ def unary_ndarray_function(src, out=None, *args, **kwargs): c_array(mx_float, [args[i] for i in scalar_range]), \ c_array(NDArrayHandle, (out.handle,)), \ ctypes.c_int(len(kwargs)), \ - c_array(ctypes.c_char_p, [key.encode('ascii') for key in kwargs.keys()]), \ - c_array(ctypes.c_char_p, [str(i).encode('ascii') for i in kwargs.values()]))) + c_array(ctypes.c_char_p, [c_str(key) for key in kwargs.keys()]), \ + c_array(ctypes.c_char_p, [c_str(str(i)) for i in kwargs.values()]))) return out def generic_ndarray_function(*args, **kwargs): @@ -1176,13 +1203,13 @@ def generic_ndarray_function(*args, **kwargs): else: 
raise TypeError('argument out is required to call %s' % func_name) check_call(_LIB.MXFuncInvokeEx( \ - handle, \ - c_array(NDArrayHandle, [args[i].handle for i in use_vars_range]), \ - c_array(mx_float, [args[i] for i in scalar_range]), \ - c_array(NDArrayHandle, [v.handle for v in mutate_vars]), \ - ctypes.c_int(len(kwargs)), \ - c_array(ctypes.c_char_p, [key.encode('ascii') for key in kwargs.keys()]), \ - c_array(ctypes.c_char_p, [str(i).encode('ascii') for i in kwargs.values()]))) + handle, \ + c_array(NDArrayHandle, [args[i].handle for i in use_vars_range]), \ + c_array(mx_float, [args[i] for i in scalar_range]), \ + c_array(NDArrayHandle, [v.handle for v in mutate_vars]), \ + ctypes.c_int(len(kwargs)), \ + c_array(ctypes.c_char_p, [c_str(key) for key in kwargs.keys()]), \ + c_array(ctypes.c_char_p, [c_str(str(i)) for i in kwargs.values()]))) if n_mutate_vars == 1: return mutate_vars[0] else: diff --git a/python/mxnet/operator.py b/python/mxnet/operator.py index e9c0e385022d..f3babcb2c470 100644 --- a/python/mxnet/operator.py +++ b/python/mxnet/operator.py @@ -5,13 +5,15 @@ from threading import Lock from ctypes import CFUNCTYPE, POINTER, Structure, pointer -from ctypes import c_void_p, cast, c_int, c_char, c_char_p, cast, c_bool -c_int_p = POINTER(c_int) +from ctypes import c_void_p, c_int, c_char, c_char_p, cast, c_bool + from .base import _LIB, check_call from .base import c_array, c_str, mx_uint, mx_float, ctypes2numpy_shared, NDArrayHandle, py_str from . 
import symbol from .ndarray import NDArray +c_int_p = POINTER(c_int) + class PythonOp(object): """Base class for operators implemented in python diff --git a/python/mxnet/optimizer.py b/python/mxnet/optimizer.py index 94a84232f81f..e7658a50aeac 100644 --- a/python/mxnet/optimizer.py +++ b/python/mxnet/optimizer.py @@ -457,10 +457,7 @@ def __init__(self, momentum=0.0, rescale_grad=1., clip_gradient=-1., **kwargs): def __getstate__(self): this = self.__dict__.copy() - if this.get('handle', None) is not None: - this['handle'] = True - else: - this['handle'] = False + this['handle'] = this.get('handle', None) is not None def __setstate__(self, state): if state.get('handle', False): diff --git a/python/mxnet/recordio.py b/python/mxnet/recordio.py index 8a6fc607f2a4..49a2b766a1fa 100644 --- a/python/mxnet/recordio.py +++ b/python/mxnet/recordio.py @@ -5,12 +5,16 @@ from __future__ import absolute_import from collections import namedtuple +import os import ctypes +import struct +import numbers +import numpy as np + from .base import _LIB from .base import RecordIOHandle from .base import check_call -import struct -import numpy as np +from .base import c_str try: import cv2 opencv_available = True @@ -28,7 +32,7 @@ class MXRecordIO(object): "r" for reading or "w" writing. """ def __init__(self, uri, flag): - self.uri = ctypes.c_char_p(uri) + self.uri = c_str(uri) self.handle = RecordIOHandle() self.flag = flag self.is_open = False @@ -69,7 +73,7 @@ def write(self, buf): Parameters ---------- - buf : string + buf : string (python2), bytes (python3) buffer to write. """ assert self.writable @@ -97,6 +101,76 @@ def read(self): else: return None +class MXIndexedRecordIO(MXRecordIO): + """Python interface for read/write RecordIO data formmat with index. + Support random access. + + Parameters + ---------- + idx_path : str + Path to index file + uri : str + Path to record file. Only support file types that are seekable. 
+ flag : str + 'w' for write or 'r' for read + key_type : type + data type for keys + """ + def __init__(self, idx_path, uri, flag, key_type=int): + super(MXIndexedRecordIO, self).__init__(uri, flag) + self.idx_path = idx_path + self.idx = {} + self.key_type = key_type + if not self.writable and os.path.isfile(idx_path): + with open(idx_path) as fin: + for line in fin.readlines(): + line = line.strip().split('\t') + self.idx[key_type(line[0])] = int(line[1]) + + def close(self): + if self.writable: + with open(self.idx_path, 'w') as fout: + for k, v in self.idx.items(): + fout.write(str(k)+'\t'+str(v)+'\n') + super(MXIndexedRecordIO, self).close() + + def reset(self): + if self.writable: + self.idx = {} + super(MXIndexedRecordIO, self).close() + super(MXIndexedRecordIO, self).open() + + def seek(self, idx): + """Query current read head position""" + assert not self.writable + pos = ctypes.c_size_t(self.idx[idx]) + check_call(_LIB.MXRecordIOReaderSeek(self.handle, pos)) + + def tell(self): + """Query current write head position""" + assert self.writable + pos = ctypes.c_size_t() + check_call(_LIB.MXRecordIOWriterTell(self.handle, ctypes.byref(pos))) + return pos.value + + def read_idx(self, idx): + """Read record with index""" + self.seek(idx) + return self.read() + + def write_idx(self, idx, buf): + """Write record with index""" + pos = self.tell() + self.idx[self.key_type(idx)] = pos + self.write(buf) + + def keys(self): + """List all keys from index""" + return list(self.idx.keys()) + + + + IRHeader = namedtuple('HEADER', ['flag', 'label', 'id', 'id2']) _IRFormat = 'IfQQ' _IRSize = struct.calcsize(_IRFormat) @@ -107,11 +181,18 @@ def pack(header, s): Parameters ---------- header : IRHeader - header of the image record + header of the image record. + header.label can be a number or an array. 
s : str string to pack """ header = IRHeader(*header) + if isinstance(header.label, numbers.Number): + header = header._replace(flag=0) + else: + label = np.asarray(header.label, dtype=np.float32) + header = header._replace(flag=label.size, label=0) + s = label.tostring() + s s = struct.pack(_IRFormat, *header) + s return s @@ -131,7 +212,11 @@ def unpack(s): unpacked string """ header = IRHeader(*struct.unpack(_IRFormat, s[:_IRSize])) - return header, s[_IRSize:] + s = s[_IRSize:] + if header.flag > 0: + header = header._replace(label=np.fromstring(s, np.float32, header.flag)) + s = s[header.flag*4:] + return header, s def unpack_img(s, iscolor=-1): """unpack a MXImageRecord to image @@ -163,6 +248,7 @@ def pack_img(header, img, quality=80, img_fmt='.jpg'): ---------- header : IRHeader header of the image record + header.label can be a number or an array. img : numpy.ndarray image to pack quality : int @@ -176,12 +262,12 @@ def pack_img(header, img, quality=80, img_fmt='.jpg'): The packed string """ assert opencv_available - jpg_formats = set(['.jpg', '.jpeg', '.JPG', '.JPEG']) - png_formats = set(['.png', '.PNG']) + jpg_formats = ['.JPG', '.JPEG'] + png_formats = ['.PNG'] encode_params = None - if img_fmt in jpg_formats: + if img_fmt.upper() in jpg_formats: encode_params = [cv2.IMWRITE_JPEG_QUALITY, quality] - elif img_fmt in png_formats: + elif img_fmt.upper() in png_formats: encode_params = [cv2.IMWRITE_PNG_COMPRESSION, quality] ret, buf = cv2.imencode(img_fmt, img, encode_params) diff --git a/python/mxnet/test_utils.py b/python/mxnet/test_utils.py new file mode 100644 index 000000000000..4ce1521ae410 --- /dev/null +++ b/python/mxnet/test_utils.py @@ -0,0 +1,602 @@ +# coding: utf-8 +"""Tools for testing.""" +# pylint: disable=invalid-name, no-member, too-many-arguments, too-many-locals, too-many-branches, too-many-statements, broad-except, line-too-long +from __future__ import absolute_import, print_function, division +import time +import numpy as np +import 
numpy.testing as npt +import mxnet as mx +_rng = np.random.RandomState(1234) + + +def np_reduce(dat, axis, keepdims, numpy_reduce_func): + """Compatible reduce for old version numpy + + Parameters + ---------- + dat : np.ndarray + Same as Numpy + + axis : None or int or list-like + Same as Numpy + + keepdims : bool + Same as Numpy + + numpy_reduce_func : function + Numpy reducing function like `np.sum` or `np.max` + """ + if isinstance(axis, int): + axis = [axis] + else: + axis = list(axis) if axis is not None else range(len(dat.shape)) + ret = dat + for i in reversed(sorted(axis)): + ret = numpy_reduce_func(ret, axis=i) + if keepdims: + keepdims_shape = list(dat.shape) + for i in axis: + keepdims_shape[i] = 1 + ret = ret.reshape(tuple(keepdims_shape)) + return ret + + +def same(a, b): + """Test if two numpy arrays are the same + + Parameters + ---------- + a : np.ndarray + b : np.ndarray + """ + return np.array_equal(a, b) + + +def reldiff(a, b): + """Calculate the relative difference between two input arrays + + Calculated by :math:`\\frac{|a-b|^2}{|a|^2 + |b|^2}` + + Parameters + ---------- + a : np.ndarray + b : np.ndarray + """ + diff = np.sum(np.abs(a - b)) + norm = np.sum(np.abs(a)) + np.sum(np.abs(b)) + if diff == 0: + return 0 + ret = diff / norm + return ret + + +def _parse_location(sym, location, ctx): + """Parse the given location to a dictionary + + Parameters + ---------- + sym : Symbol + location : None or list of np.ndarray or dict of str to np.ndarray + + Returns + ------- + dict of str to np.ndarray + """ + assert isinstance(location, (dict, list, tuple)) + if isinstance(location, dict): + if set(location.keys()) != set(sym.list_arguments()): + raise ValueError("Symbol arguments and keys of the given location do not match." 
+ "symbol args:%s, location.keys():%s" + % (str(set(sym.list_arguments())), str(set(location.keys())))) + else: + location = {k: v for k, v in zip(sym.list_arguments(), location)} + location = {k: mx.nd.array(v, ctx=ctx) for k, v in location.items()} + return location + + +def _parse_aux_states(sym, aux_states, ctx): + """ + + Parameters + ---------- + sym : Symbol + aux_states : None or list of np.ndarray or dict of str to np.ndarray + + Returns + ------- + dict of str to np.ndarray + """ + if aux_states is not None: + if isinstance(aux_states, dict): + if set(aux_states.keys()) != set(sym.list_auxiliary_states()): + raise ValueError("Symbol aux_states names and given aux_states do not match." + "symbol aux_names:%s, aux_states.keys:%s" + % (str(set(sym.list_auxiliary_states())), + str(set(aux_states.keys())))) + elif isinstance(aux_states, (list, tuple)): + aux_names = sym.list_auxiliary_states() + aux_states = {k:v for k, v in zip(aux_names, aux_states)} + aux_states = {k: mx.nd.array(v, ctx=ctx) for k, v in aux_states.items()} + return aux_states + + +def numeric_grad(executor, location, aux_states=None, eps=1e-4, use_forward_train=True): + """Calculates a numeric gradient via finite difference method. + + Class based on Theano's `theano.gradient.numeric_grad` [1] + + Parameters + ---------- + executor : Executor + exectutor that computes the forward pass + location : list of numpy.ndarray or dict of str to numpy.ndarray + Argument values used as location to compute gradient + Maps the name of arguments to the corresponding numpy.ndarray. + Value of all the arguments must be provided. + aux_states : None or list of numpy.ndarray or dict of str to numpy.ndarray + Auxiliary states values used as location to compute gradient + Maps the name of aux_states to the corresponding numpy.ndarray. + Value of all the auxiliary arguments must be provided. 
+ eps : float, optional + epsilon for the finite-difference method + + References + --------- + ..[1] https://github.com/Theano/Theano/blob/master/theano/gradient.py + """ + for k, v in location.items(): + executor.arg_dict[k][:] = v + approx_grads = {k:np.zeros(v.shape, dtype=np.float32) for k, v in location.items()} + + executor.forward(is_train=use_forward_train) + f_x = executor.outputs[0].asnumpy()[0] + for k, v in location.items(): + old_value = v.copy() + for i in range(np.prod(v.shape)): + # inplace update + v.reshape((np.prod(v.shape), 1))[i] += eps + # set initial states. Need to set all due to inplace operations + for key, val in location.items(): + executor.arg_dict[key][:] = val + if aux_states is not None: + for key, val in aux_states.items(): + executor.aux_dict[key][:] = val + executor.forward(is_train=use_forward_train) + f_eps = executor.outputs[0].asnumpy()[0] + approx_grads[k].ravel()[i] = (f_eps - f_x) / eps + v.reshape((np.prod(v.shape), 1))[i] = old_value.reshape((np.prod(v.shape), 1))[i] + + return approx_grads + + +def check_numeric_gradient(sym, location, aux_states=None, numeric_eps=1e-4, check_eps=1e-2, + grad_nodes=None, use_forward_train=True, ctx=mx.cpu()): + """Verify an operation by checking backward pass via finite difference method. + + Based on Theano's `theano.gradient.verify_grad` [1] + + Parameters + ---------- + sym : Symbol + Symbol containing op to test + location : list or tuple or dict + Argument values used as location to compute gradient + + - if type is list of numpy.ndarray + inner elements should have the same the same order as mxnet.sym.list_arguments(). + - if type is dict of str -> numpy.ndarray + maps the name of arguments to the corresponding numpy.ndarray. 
+ *In either case, value of all the arguments must be provided.* + aux_states : ist or tuple or dict, optional + The auxiliary states required when generating the executor for the symbol + numeric_eps : float, optional + Delta for the finite difference method that approximates the gradient + check_eps : float, optional + relative error eps used when comparing numeric grad to symbolic grad + grad_nodes : None or list or tuple or dict, optional + Names of the nodes to check gradient on + use_forward_train : bool + Whether to use is_train=True when computing the finite-difference + ctx : Context, optional + Check the gradient computation on the specified device + References + --------- + ..[1] https://github.com/Theano/Theano/blob/master/theano/gradient.py + """ + + def random_projection(shape): + """Get a random weight matrix with not too small elements + + Parameters + ---------- + shape : list or tuple + """ + # random_projection should not have elements too small, + # otherwise too much precision is lost in numerical gradient + plain = _rng.rand(*shape) + 0.1 + return plain + location = _parse_location(sym=sym, location=location, ctx=ctx) + location_npy = {k:v.asnumpy() for k, v in location.items()} + aux_states = _parse_aux_states(sym=sym, aux_states=aux_states, ctx=ctx) + if aux_states is not None: + aux_states_npy = {k:v.asnumpy() for k, v in aux_states.items()} + else: + aux_states_npy = None + if grad_nodes is None: + grad_nodes = sym.list_arguments() + grad_req = {k: 'write' for k in grad_nodes} + elif isinstance(grad_nodes, (list, tuple)): + grad_nodes = list(grad_nodes) + grad_req = {k: 'write' for k in grad_nodes} + elif isinstance(grad_nodes, dict): + grad_req = grad_nodes.copy() + grad_nodes = grad_nodes.keys() + else: + raise ValueError + + input_shape = {k: v.shape for k, v in location.items()} + _, out_shape, _ = sym.infer_shape(**input_shape) + proj = mx.sym.Variable("__random_proj") + out = mx.sym.sum(sym * proj) + out = mx.sym.MakeLoss(out) + + 
location = dict(list(location.items()) + + [("__random_proj", mx.nd.array(random_projection(out_shape[0]), ctx=ctx))]) + args_grad_npy = dict([(k, _rng.normal(0, 0.01, size=location[k].shape)) for k in grad_nodes] + + [("__random_proj", _rng.normal(0, 0.01, size=out_shape[0]))]) + + args_grad = {k: mx.nd.array(v, ctx=ctx) for k, v in args_grad_npy.items()} + + executor = out.bind(ctx, grad_req=grad_req, + args=location, args_grad=args_grad, aux_states=aux_states) + + inps = executor.arg_arrays + if len(inps) != len(location): + raise ValueError("Executor arg_arrays and and location len do not match." + "Got %d inputs and %d locations"%(len(inps), len(location))) + assert len(executor.outputs) == 1 + + executor.forward(is_train=True) + executor.backward() + symbolic_grads = {k:executor.grad_dict[k].asnumpy() for k in grad_nodes} + + numeric_gradients = numeric_grad(executor, location_npy, aux_states_npy, + eps=numeric_eps, use_forward_train=use_forward_train) + for name in grad_nodes: + fd_grad = numeric_gradients[name] + orig_grad = args_grad_npy[name] + sym_grad = symbolic_grads[name] + if grad_req[name] == 'write': + rel = reldiff(fd_grad, sym_grad) + arr_l = [fd_grad, sym_grad] + elif grad_req[name] == 'add': + rel = reldiff(fd_grad, sym_grad - orig_grad) + arr_l = [fd_grad, sym_grad - orig_grad] + elif grad_req[name] == 'null': + rel = reldiff(orig_grad, sym_grad) + arr_l = [orig_grad, sym_grad] + else: + raise ValueError + if np.isnan(rel) or rel > check_eps: + np.set_printoptions(threshold=4, suppress=True) + msg = npt.build_err_msg(arr_l, + err_msg="In symbol \"%s\", ctx=%s, " + "numeric check failed for \"%s\", grad_req= \"%s\". " + "Rel Err=%f, Expected <=%f" + %(sym.name, str(ctx), name, grad_req[name], rel, check_eps), + names=["NUMERICAL", "BACKWARD"]) + raise Exception(msg) + + +def check_symbolic_forward(sym, location, expected, check_eps=1E-4, aux_states=None, ctx=mx.cpu()): + """Compare foward call to expected value. 
+ + Parameters + --------- + sym : Symbol + output symbol + location : list of np.ndarray or dict of str to np.ndarray + The evaluation point + + - if type is list of np.ndarray + contain all the numpy arrays corresponding to `sym.list_arguments()` + - if type is dict of str to np.ndarray + contain the mapping between argument names and their values + expected : list of np.ndarray or dict of str to np.ndarray + The expected output value + + - if type is list of np.ndarray + contain arrays corresponding to exe.outputs + - if type is dict of str to np.ndarray + contain mapping between sym.list_output() and exe.outputs + check_eps : float, optional + relative error to check to + aux_states : list of np.ndarray of dict, optional + - if type is list of np.ndarray + contain all the numpy arrays corresponding to sym.list_auxiliary_states + - if type is dict of str to np.ndarray + contain the mapping between names of auxiliary states and their values + ctx : Context, optional + running context + """ + location = _parse_location(sym=sym, location=location, ctx=ctx) + aux_states = _parse_aux_states(sym=sym, aux_states=aux_states, ctx=ctx) + if isinstance(expected, dict): + expected = [expected[k] for k in sym.list_outputs()] + args_grad_data = {k:mx.nd.empty(v.shape, ctx=ctx) for k, v in location.items()} + + executor = sym.bind(ctx=ctx, args=location, args_grad=args_grad_data, aux_states=aux_states) + for g in executor.grad_arrays: + if g: + g[:] = 0 + + executor.forward(is_train=False) + outputs = [x.asnumpy() for x in executor.outputs] + + for output_name, expect, output in zip(sym.list_outputs(), expected, outputs): + rel = reldiff(expect, output) + if rel > check_eps: + np.set_printoptions(threshold=4, suppress=True) + msg = npt.build_err_msg([expect, output], + err_msg="In symbol \"%s\", ctx=%s, " + "forward check failed for \"%s\". 
" + "Rel Err=%f, Expected <=%f" + %(sym.name, str(ctx), output_name, rel, check_eps), + names=["EXPECTED", "FORWARD"]) + raise Exception(msg) + + +def check_symbolic_backward(sym, location, out_grads, expected, check_eps=1e-5, + aux_states=None, grad_req='write', ctx=mx.cpu()): + """Compare backward call to expected value. + + Parameters + --------- + sym : Symbol + output symbol + location : list of np.ndarray or dict of str to np.ndarray + The evaluation point + + - if type is list of np.ndarray + contain all the numpy arrays corresponding to mxnet.sym.list_arguments + - if type is dict of str to np.ndarray + contain the mapping between argument names and their values + out_grads : None or list of np.ndarray or dict of str to np.ndarray + numpy arrays corresponding to sym.outputs for incomming gradient + + - if type is list of np.ndarray + contains arrays corresponding to exe.outputs + - if type is dict of str to np.ndarray + contains mapping between mxnet.sym.list_output() and Executor.outputs + expected : list of np.ndarray or dict of str to np.ndarray + expected gradient values + + - if type is list of np.ndarray + contains arrays corresponding to exe.grad_arrays + - if type is dict of str to np.ndarray + contains mapping between sym.list_arguments() and exe.outputs + check_eps: float, optional + relative error to check to + aux_states : list of np.ndarray or dict of str to np.ndarray + grad_req : str or list of str or dict of str to str, optional + gradient requirements. 
'write', 'add' or 'null' + ctx : Context, optional + running context + """ + location = _parse_location(sym=sym, location=location, ctx=ctx) + aux_states = _parse_aux_states(sym=sym, aux_states=aux_states, ctx=ctx) + if isinstance(expected, (list, tuple)): + expected = {k:v for k, v in zip(sym.list_arguments(), expected)} + args_grad_npy = {k:_rng.normal(size=v.shape) for k, v in expected.items()} + args_grad_data = {k: mx.nd.array(v, ctx=ctx) for k, v in args_grad_npy.items()} + if isinstance(grad_req, str): + grad_req = {k:grad_req for k in sym.list_arguments()} + elif isinstance(grad_req, (list, tuple)): + grad_req = {k:v for k, v in zip(sym.list_arguments(), grad_req)} + + executor = sym.bind(ctx=ctx, args=location, args_grad=args_grad_data, aux_states=aux_states) + executor.forward(is_train=True) + if isinstance(out_grads, (tuple, list)): + out_grads = [mx.nd.array(v, ctx=ctx) for v in out_grads] + elif isinstance(out_grads, (dict)): + out_grads = {k:mx.nd.array(v, ctx=ctx) for k, v in out_grads.items()} + else: + assert out_grads is None + executor.backward(out_grads) + + grads = {k: v.asnumpy() for k, v in args_grad_data.items()} + for name in expected: + if grad_req[name] == 'write': + rel = reldiff(expected[name], grads[name]) + arr_l = [expected[name], grads[name]] + elif grad_req[name] == 'add': + rel = reldiff(expected[name], grads[name] - args_grad_npy[name]) + arr_l = [expected[name], grads[name] - args_grad_npy[name]] + elif grad_req[name] == 'null': + rel = reldiff(args_grad_npy[name], grads[name]) + arr_l = [args_grad_npy[name], grads[name]] + else: + raise ValueError + if rel > check_eps: + np.set_printoptions(threshold=4, suppress=True) + msg = npt.build_err_msg(arr_l, + err_msg="In symbol \"%s\", ctx=%s, " + "backward check failed for \"%s\". 
" + "Rel Err=%f, Expected <=%f" + %(sym.name, str(ctx), name, rel, check_eps), + names=["EXPECTED", "BACKWARD"]) + raise Exception(msg) + + +def check_speed(sym, location=None, ctx=mx.cpu(), N=20, grad_req=None, typ="whole", + **kwargs): + """Check the running speed of a symbol + + Parameters + ---------- + sym : Symbol + symbol to run the speed test + location : none or dict of str to np.ndarray + location to evaluate the inner executor + ctx : Context + running context + N : int, optional + repeat times + grad_req : None or str or list of str or dict of str to str, optional + gradient requirements + typ : str, optional + "whole" or "forward" + + - "whole" + test the forward_backward speed + - "forward" + only test the forward speed + """ + if grad_req is None: + grad_req = 'write' + if location is None: + exe = sym.simple_bind(grad_req=grad_req, ctx=ctx, **kwargs) + location = {k: _rng.normal(size=arr.shape, scale=1.0) for k, arr in + exe.arg_dict.items()} + else: + assert isinstance(location, dict), "Expect dict, get \"location\"=%s" %str(location) + exe = sym.simple_bind(grad_req=grad_req, ctx=ctx, + **{k: v.shape for k, v in location.items()}) + + for name, iarr in location.items(): + exe.arg_dict[name][:] = iarr.astype(exe.arg_dict[name].dtype) + + if typ == "whole": + # Warm up + exe.forward(is_train=True) + exe.backward(out_grads=exe.outputs) + for output in exe.outputs: + output.wait_to_read() + # Test forward + backward + tic = time.time() + for _ in range(N): + exe.forward(is_train=True) + exe.backward(out_grads=exe.outputs) + for output in exe.outputs: + output.wait_to_read() + mx.nd.waitall() + toc = time.time() + forward_backward_time = (toc - tic) * 1.0 / N + return forward_backward_time + elif typ == "forward": + # Warm up + exe.forward(is_train=False) + for output in exe.outputs: + output.wait_to_read() + + # Test forward only + tic = time.time() + for _ in range(N): + exe.forward(is_train=False) + for output in exe.outputs: + output.wait_to_read() 
+ mx.nd.waitall() + toc = time.time() + forward_time = (toc - tic) * 1.0 / N + return forward_time + else: + raise ValueError('typ can only be "whole" or "forward".') + + +def check_consistency(sym, ctx_list, scale=1.0, grad_req='write'): + """Check symbol gives the same output for different running context + + Parameters + ---------- + sym : Symbol + symbol to run the consistency test + ctx_list : list + running context. See example for more detail. + scale : float, optional + standard deviation of the inner normal distribution. Used in initialization + grad_req : str or list of str or dict of str to str + gradient requirement. + Examples + -------- + >>> # create the symbol + >>> sym = mx.sym.Convolution(num_filter=3, kernel=(3,3), name='conv') + >>> # initialize the running context + >>> ctx_list =\ +[{'ctx': mx.gpu(0), 'conv_data': (2, 2, 10, 10), 'type_dict': {'conv_data': np.float64}},\ + {'ctx': mx.gpu(0), 'conv_data': (2, 2, 10, 10), 'type_dict': {'conv_data': np.float32}},\ + {'ctx': mx.gpu(0), 'conv_data': (2, 2, 10, 10), 'type_dict': {'conv_data': np.float16}},\ + {'ctx': mx.cpu(0), 'conv_data': (2, 2, 10, 10), 'type_dict': {'conv_data': np.float64}},\ + {'ctx': mx.cpu(0), 'conv_data': (2, 2, 10, 10), 'type_dict': {'conv_data': np.float32}}] + >>> check_consistency(sym, ctx_list) + >>> sym = mx.sym.Concat(name='concat', num_args=2) + >>> ctx_list = \ +[{'ctx': mx.gpu(0), 'concat_arg1': (2, 10), 'concat_arg0': (2, 10),\ + 'type_dict': {'concat_arg0': np.float64, 'concat_arg1': np.float64}},\ + {'ctx': mx.gpu(0), 'concat_arg1': (2, 10), 'concat_arg0': (2, 10),\ + 'type_dict': {'concat_arg0': np.float32, 'concat_arg1': np.float32}},\ + {'ctx': mx.gpu(0), 'concat_arg1': (2, 10), 'concat_arg0': (2, 10),\ + 'type_dict': {'concat_arg0': np.float16, 'concat_arg1': np.float16}},\ + {'ctx': mx.cpu(0), 'concat_arg1': (2, 10), 'concat_arg0': (2, 10),\ + 'type_dict': {'concat_arg0': np.float64, 'concat_arg1': np.float64}},\ + {'ctx': mx.cpu(0), 'concat_arg1': (2, 
10), 'concat_arg0': (2, 10),\ + 'type_dict': {'concat_arg0': np.float32, 'concat_arg1': np.float32}}] + >>> check_consistency(sym, ctx_list) + """ + tol = {np.dtype(np.float16): 1e-1, + np.dtype(np.float32): 1e-3, + np.dtype(np.float64): 1e-5, + np.dtype(np.uint8): 0, + np.dtype(np.int32): 0} + assert len(ctx_list) > 1 + exe_list = [sym.simple_bind(grad_req=grad_req, **ctx) for ctx in ctx_list] + for exe in exe_list: + assert len(exe.outputs) == 1 + assert len(exe.arg_arrays) == len(exe_list[0].arg_arrays) + assert len(exe.grad_arrays) == len(exe_list[0].grad_arrays) + + init = [np.random.normal(size=arr.shape, scale=scale) for arr in exe_list[0].arg_arrays] + if sym.name == 'embedding': + init[0] = np.random.randint(low=0, high=10, size=exe_list[0].arg_arrays[0].shape) + + for exe in exe_list: + for arr, iarr in zip(exe.arg_arrays, init): + arr[:] = iarr.astype(arr.dtype) + + # forward + for exe in exe_list: + exe.forward(is_train=True) + exe.backward(exe.outputs[0]) + + outputs = [exe.outputs[0].asnumpy() for exe in exe_list] + # lazy solution handling None grad + grads = [[grad.asnumpy() if grad is not None else np.zeros(1) for grad in exe.grad_arrays] for exe in exe_list] + dtypes = [arr.dtype for arr in outputs] + max_idx = np.argmax(dtypes) + + for i, exe in enumerate(exe_list): + if i == max_idx: + continue + for arr1, arr2 in zip([outputs[i]]+grads[i], [outputs[max_idx]]+grads[max_idx]): + arr2 = arr2.astype(dtypes[i]) + try: + npt.assert_allclose(arr1, arr2, rtol=tol[dtypes[i]], atol=tol[dtypes[i]]) + except Exception as e: + print(e) + + #forward predict + for exe in exe_list: + exe.forward(is_train=False) + + outputs = [exe.outputs[0].asnumpy() for exe in exe_list] + dtypes = [arr.dtype for arr in outputs] + max_idx = np.argmax(dtypes) + + for i, exe in enumerate(exe_list): + if i == max_idx: + continue + for arr1, arr2 in zip([outputs[i]], [outputs[max_idx]]): + arr2 = arr2.astype(dtypes[i]) + try: + npt.assert_allclose(arr1, arr2, rtol=tol[dtypes[i]], 
atol=tol[dtypes[i]]) + except Exception as e: + print(e) diff --git a/python/mxnet/visualization.py b/python/mxnet/visualization.py index a4c4ee47ba1b..bc15a514d885 100644 --- a/python/mxnet/visualization.py +++ b/python/mxnet/visualization.py @@ -5,11 +5,11 @@ """Visualization module""" from __future__ import absolute_import -from .symbol import Symbol -import json import re import copy +import json +from .symbol import Symbol def _str2tuple(string): """convert shape string to list, internal use only @@ -25,8 +25,146 @@ def _str2tuple(string): """ return re.findall(r"\d+", string) +def print_summary(symbol, shape=None, line_length=120, positions=[.44, .64, .74, 1.]): + """convert symbol for detail information + + Parameters + ---------- + symbol: Symbol + symbol to be visualized + shape: dict + dict of shapes, str->shape (tuple), given input shapes + line_length: int + total length of printed lines + positions: list + relative or absolute positions of log elements in each line + Returns + ------ + void + """ + if not isinstance(symbol, Symbol): + raise TypeError("symbol must be Symbol") + show_shape = False + if shape != None: + show_shape = True + interals = symbol.get_internals() + _, out_shapes, _ = interals.infer_shape(**shape) + if out_shapes == None: + raise ValueError("Input shape is incompete") + shape_dict = dict(zip(interals.list_outputs(), out_shapes)) + conf = json.loads(symbol.tojson()) + nodes = conf["nodes"] + heads = set(conf["heads"][0]) + if positions[-1] <= 1: + positions = [int(line_length * p) for p in positions] + # header names for the different log elements + to_display = ['Layer (type)', 'Output Shape', 'Param #', 'Previous Layer'] + def print_row(fields, positions): + """print format row + + Parameters + ---------- + fields: list + information field + positions: list + field length ratio + Returns + ------ + void + """ + line = '' + for i in range(len(fields)): + line += str(fields[i]) + line = line[:positions[i]] + line += ' ' * 
(positions[i] - len(line)) + print(line) + print('_' * line_length) + print_row(to_display, positions) + print('=' * line_length) + def print_layer_summary(node, out_shape): + """print layer information -def plot_network(symbol, title="plot", shape=None, node_attrs={}): + Parameters + ---------- + node: dict + node information + out_shape: dict + node shape information + Returns + ------ + node total parameters + """ + op = node["op"] + pre_node = [] + pre_filter = 0 + if op != "null": + inputs = node["inputs"] + for item in inputs: + input_node = nodes[item[0]] + input_name = input_node["name"] + if input_node["op"] != "null" or item[0] in heads: + # add precede + pre_node.append(input_name) + if show_shape: + if input_node["op"] != "null": + key = input_name + "_output" + shape = shape_dict[key][1:] + pre_filter = pre_filter + int(shape[0]) + else: + key = input_name + shape = shape_dict[key][1:] + pre_filter = pre_filter + int(shape[0]) + cur_param = 0 + if op == 'Convolution': + cur_param = pre_filter \ + * int(_str2tuple(node["param"]["kernel"])[0]) \ + * int(_str2tuple(node["param"]["kernel"])[1]) \ + * int(node["param"]["num_filter"]) \ + + int(node["param"]["num_filter"]) + elif op == 'FullyConnected': + cur_param = pre_filter * (int(node["param"]["num_hidden"]) + 1) + elif op == 'BatchNorm': + key = node["name"] + "_output" + num_filter = shape_dict[key][1] + cur_param = int(num_filter) * 2 + if not pre_node: + first_connection = '' + else: + first_connection = pre_node[0] + fields = [node['name'] + '(' + op + ')', + "x".join([str(x) for x in out_shape]), + cur_param, + first_connection] + print_row(fields, positions) + if len(pre_node) > 1: + for i in range(1, len(pre_node)): + fields = ['', '', '', pre_node[i]] + print_row(fields, positions) + return cur_param + total_params = 0 + for i in range(len(nodes)): + node = nodes[i] + out_shape = [] + op = node["op"] + if op == "null" and i > 0: + continue + if op != "null" or i in heads: + if show_shape: + if 
op != "null": + key = node["name"] + "_output" + out_shape = shape_dict[key][1:] + else: + key = node["name"] + out_shape = shape_dict[key][1:] + total_params += print_layer_summary(nodes[i], out_shape) + if i == len(nodes) - 1: + print('=' * line_length) + else: + print('_' * line_length) + print('Total params: %s' % total_params) + print('_' * line_length) + +def plot_network(symbol, title="plot", save_format='pdf', shape=None, node_attrs={}): """convert symbol to dot object for visualization Parameters @@ -59,25 +197,24 @@ def plot_network(symbol, title="plot", shape=None, node_attrs={}): draw_shape = True interals = symbol.get_internals() _, out_shapes, _ = interals.infer_shape(**shape) - if out_shapes == None: + if out_shapes is None: raise ValueError("Input shape is incompete") shape_dict = dict(zip(interals.list_outputs(), out_shapes)) conf = json.loads(symbol.tojson()) nodes = conf["nodes"] - heads = set([x[0] for x in conf["heads"]]) # TODO(xxx): check careful + heads = set(conf["heads"][0]) # TODO(xxx): check careful # default attributes of node node_attr = {"shape": "box", "fixedsize": "true", "width": "1.3", "height": "0.8034", "style": "filled"} # merge the dict provided by user and the default one node_attr.update(node_attrs) - dot = Digraph(name=title) + dot = Digraph(name=title, format=save_format) # color map cm = ("#8dd3c7", "#fb8072", "#ffffb3", "#bebada", "#80b1d3", "#fdb462", "#b3de69", "#fccde5") # make nodes - for i in range(len(nodes)): - node = nodes[i] + for i, node in enumerate(nodes): op = node["op"] name = node["name"] # input data @@ -120,8 +257,7 @@ def plot_network(symbol, title="plot", shape=None, node_attrs={}): dot.node(name=name, label=label, **attr) # add edges - for i in range(len(nodes)): - node = nodes[i] + for i, node in enumerate(nodes): op = node["op"] name = node["name"] if op == "null": diff --git a/scala-package/README.md b/scala-package/README.md index a39e3f5f1394..35630799716a 100644 --- a/scala-package/README.md +++ 
b/scala-package/README.md @@ -100,12 +100,12 @@ import ml.dmlc.mxnet.optimizer.SGD // model definition val data = Symbol.Variable("data") -val fc1 = Symbol.FullyConnected(name = "fc1")(Map("data" -> data, "num_hidden" -> 128)) -val act1 = Symbol.Activation(name = "relu1")(Map("data" -> fc1, "act_type" -> "relu")) -val fc2 = Symbol.FullyConnected(name = "fc2")(Map("data" -> act1, "num_hidden" -> 64)) -val act2 = Symbol.Activation(name = "relu2")(Map("data" -> fc2, "act_type" -> "relu")) -val fc3 = Symbol.FullyConnected(name = "fc3")(Map("data" -> act2, "num_hidden" -> 10)) -val mlp = Symbol.SoftmaxOutput(name = "sm")(Map("data" -> fc3)) +val fc1 = Symbol.FullyConnected(name = "fc1")()(Map("data" -> data, "num_hidden" -> 128)) +val act1 = Symbol.Activation(name = "relu1")()(Map("data" -> fc1, "act_type" -> "relu")) +val fc2 = Symbol.FullyConnected(name = "fc2")()(Map("data" -> act1, "num_hidden" -> 64)) +val act2 = Symbol.Activation(name = "relu2")()(Map("data" -> fc2, "act_type" -> "relu")) +val fc3 = Symbol.FullyConnected(name = "fc3")()(Map("data" -> act2, "num_hidden" -> 10)) +val mlp = Symbol.SoftmaxOutput(name = "sm")()(Map("data" -> fc3)) // load MNIST dataset val trainDataIter = IO.MNISTIter(Map( diff --git a/scala-package/assembly/linux-x86_64-cpu/pom.xml b/scala-package/assembly/linux-x86_64-cpu/pom.xml index 480290d04a2f..cd3ae86acbee 100644 --- a/scala-package/assembly/linux-x86_64-cpu/pom.xml +++ b/scala-package/assembly/linux-x86_64-cpu/pom.xml @@ -5,13 +5,13 @@ 4.0.0 ml.dmlc.mxnet - mxnet-full-parent_2.10 + mxnet-full-parent_2.11 0.1.2-SNAPSHOT ../pom.xml ml.dmlc.mxnet - mxnet-full_2.10-linux-x86_64-cpu + mxnet-full_2.11-linux-x86_64-cpu 0.1.2-SNAPSHOT MXNet Scala Package - Full Linux-x86_64 CPU-only jar diff --git a/scala-package/assembly/linux-x86_64-gpu/pom.xml b/scala-package/assembly/linux-x86_64-gpu/pom.xml index d36ca80c00fa..3f69fafdf45b 100644 --- a/scala-package/assembly/linux-x86_64-gpu/pom.xml +++ 
b/scala-package/assembly/linux-x86_64-gpu/pom.xml @@ -5,13 +5,13 @@ 4.0.0 ml.dmlc.mxnet - mxnet-full-parent_2.10 + mxnet-full-parent_2.11 0.1.2-SNAPSHOT ../pom.xml ml.dmlc.mxnet - mxnet-full_2.10-linux-x86_64-gpu + mxnet-full_2.11-linux-x86_64-gpu 0.1.2-SNAPSHOT MXNet Scala Package - Full Linux-x86_64 GPU jar diff --git a/scala-package/assembly/osx-x86_64-cpu/pom.xml b/scala-package/assembly/osx-x86_64-cpu/pom.xml index d5d622f87d8e..718da93d86c8 100644 --- a/scala-package/assembly/osx-x86_64-cpu/pom.xml +++ b/scala-package/assembly/osx-x86_64-cpu/pom.xml @@ -5,13 +5,13 @@ 4.0.0 ml.dmlc.mxnet - mxnet-full-parent_2.10 + mxnet-full-parent_2.11 0.1.2-SNAPSHOT ../pom.xml ml.dmlc.mxnet - mxnet-full_2.10-osx-x86_64-cpu + mxnet-full_2.11-osx-x86_64-cpu 0.1.2-SNAPSHOT MXNet Scala Package - Full OSX-x86_64 CPU-only jar diff --git a/scala-package/assembly/pom.xml b/scala-package/assembly/pom.xml index 9737dbdb108a..003d17be84fc 100644 --- a/scala-package/assembly/pom.xml +++ b/scala-package/assembly/pom.xml @@ -5,13 +5,13 @@ 4.0.0 ml.dmlc.mxnet - mxnet-parent_2.10 + mxnet-parent_2.11 0.1.2-SNAPSHOT ../pom.xml ml.dmlc.mxnet - mxnet-full-parent_2.10 + mxnet-full-parent_2.11 0.1.2-SNAPSHOT MXNet Scala Package - Full Parent pom diff --git a/scala-package/core/pom.xml b/scala-package/core/pom.xml index 5900a0a710a2..babdda15b2b0 100644 --- a/scala-package/core/pom.xml +++ b/scala-package/core/pom.xml @@ -5,13 +5,13 @@ 4.0.0 ml.dmlc.mxnet - mxnet-parent_2.10 + mxnet-parent_2.11 0.1.2-SNAPSHOT ../pom.xml ml.dmlc.mxnet - mxnet-core_2.10 + mxnet-core_2.11 0.1.2-SNAPSHOT MXNet Scala Package - Core @@ -74,6 +74,18 @@ + + ml.dmlc.mxnet + mxnet-init_${scala.binary.version} + ${project.version} + provided + + + ml.dmlc.mxnet + mxnet-macros_${scala.binary.version} + ${project.version} + provided + org.scala-lang scala-library @@ -102,5 +114,9 @@ org.scalacheck scalacheck_${scala.binary.version} + + org.scala-lang.modules + scala-parser-combinators_${scala.binary.version} + diff --git 
a/scala-package/core/src/main/scala/ml/dmlc/mxnet/Base.scala b/scala-package/core/src/main/scala/ml/dmlc/mxnet/Base.scala index cf3bee93a98a..0d5a6d5fb427 100644 --- a/scala-package/core/src/main/scala/ml/dmlc/mxnet/Base.scala +++ b/scala-package/core/src/main/scala/ml/dmlc/mxnet/Base.scala @@ -23,6 +23,9 @@ object Base { type KVStoreHandle = CPtrAddress type ExecutorHandle = CPtrAddress type SymbolHandle = CPtrAddress + type RecordIOHandle = CPtrAddress + type OptimizerCreator = CPtrAddress + type OptimizerHandle = CPtrAddress type MXUintRef = RefInt type MXFloatRef = RefFloat @@ -33,6 +36,10 @@ object Base { type KVStoreHandleRef = RefLong type ExecutorHandleRef = RefLong type SymbolHandleRef = RefLong + type RecordIOHandleRef = RefLong + type OptimizerCreatorRef = RefLong + type OptimizerHandleRef = RefLong + try { try { diff --git a/scala-package/core/src/main/scala/ml/dmlc/mxnet/EvalMetric.scala b/scala-package/core/src/main/scala/ml/dmlc/mxnet/EvalMetric.scala index 35aa2eef6ada..c6bb8fe7aadd 100644 --- a/scala-package/core/src/main/scala/ml/dmlc/mxnet/EvalMetric.scala +++ b/scala-package/core/src/main/scala/ml/dmlc/mxnet/EvalMetric.scala @@ -4,7 +4,7 @@ package ml.dmlc.mxnet * Base class of all evaluation metrics * @param name Metric name * - * @author Yuan Tang, Yizhi Liu + * @author Yuan Tang, Yizhi Liu, Depeng Liang */ abstract class EvalMetric(protected val name: String) { @@ -64,6 +64,90 @@ class Accuracy extends EvalMetric("accuracy") { } } +/** + * Calculate top k predictions accuracy + */ +class TopKAccuracy(topK: Int) extends EvalMetric("top_k_accuracy") { + require(topK > 1, "Please use Accuracy if topK is no more than 1") + + override def update(labels: IndexedSeq[NDArray], preds: IndexedSeq[NDArray]): Unit = { + require(labels.length == preds.length, + "labels and predictions should have the same length.") + + for ((pred, label) <- preds zip labels) { + val predShape = pred.shape + val dims = predShape.length + require(dims <= 2, "Predictions 
should be no more than 2 dims.") + val labelArray = label.toArray + val numSamples = predShape(0) + if (dims == 1) { + val predArray = pred.toArray.zipWithIndex.sortBy(_._1).reverse.map(_._2) + require(predArray.length == labelArray.length) + this.sumMetric += + labelArray.zip(predArray).map { case (l, p) => if (l == p) 1 else 0 }.sum + } else if (dims == 2) { + val numclasses = predShape(1) + val predArray = pred.toArray.grouped(numclasses).map { a => + a.zipWithIndex.sortBy(_._1).reverse.map(_._2) + }.toArray + require(predArray.length == labelArray.length) + val topK = Math.max(this.topK, numclasses) + for (j <- 0 until topK) { + this.sumMetric += + labelArray.zip(predArray.map(_(j))).map { case (l, p) => if (l == p) 1 else 0 }.sum + } + } + this.numInst += numSamples + } + } +} + +/** + * Calculate the F1 score of a binary classification problem. + */ +class F1 extends EvalMetric("f1") { + override def update(labels: IndexedSeq[NDArray], preds: IndexedSeq[NDArray]): Unit = { + require(labels.length == preds.length, + "labels and predictions should have the same length.") + + for ((pred, label) <- preds zip labels) { + val predLabel = NDArray.argmaxChannel(pred) + require(label.shape == predLabel.shape, + s"label ${label.shape} and prediction ${predLabel.shape}" + + s"should have the same length.") + val labelArray = label.toArray + var unique = Array[Float]() + labelArray.foreach(l => if (!unique.contains(l)) unique = unique :+ l) + require(unique.length <= 2, "F1 currently only supports binary classification.") + + var truePositives, falsePositives, falseNegatives = 0f + for ((labelElem, predElem) <- labelArray zip predLabel.toArray) { + if (predElem == 1 && labelElem == 1) truePositives += 1 + else if (predElem == 1 && labelElem == 0) falsePositives += 1 + else if (predElem == 0 && labelElem == 1) falseNegatives += 1 + } + + val precision = { + if (truePositives + falsePositives > 0) truePositives / (truePositives + falsePositives) + else 0f + } + + val 
recall = { + if (truePositives + falseNegatives > 0) truePositives / (truePositives + falseNegatives) + else 0f + } + + val f1Score = { + if (precision + recall > 0) (2 * precision * recall) / (precision + recall) + else 0f + } + + this.sumMetric += f1Score + this.numInst += 1 + } + } +} + // Regression metrics /** diff --git a/scala-package/core/src/main/scala/ml/dmlc/mxnet/KVStore.scala b/scala-package/core/src/main/scala/ml/dmlc/mxnet/KVStore.scala index c7ea10e04b82..dce6c6caeb0e 100644 --- a/scala-package/core/src/main/scala/ml/dmlc/mxnet/KVStore.scala +++ b/scala-package/core/src/main/scala/ml/dmlc/mxnet/KVStore.scala @@ -8,6 +8,12 @@ import org.slf4j.{LoggerFactory, Logger} * @author Yizhi Liu */ object KVStore { + + // group id of scheduler/server/worker + val GROUP_NODE_SCHEDULER = 1 + val GROUP_NODE_SERVER = 2 + val GROUP_NODE_WORKER = 4 + /** * Create a new KVStore.
* @@ -208,10 +214,25 @@ class KVStore(private[mxnet] val handle: KVStoreHandle) { * pulling, we can place a barrier to guarantee that the initialization is * finished. */ - def barrier() { + def barrier(): Unit = { checkCall(_LIB.mxKVStoreBarrier(handle)) } + def numDeadNode(nodeId: Int): Int = { + val number = new RefInt + checkCall(_LIB.mxKVStoreGetNumDeadNode(handle, nodeId, number)) + number.value + } + + /** + * Whether to do barrier when the kvstore finalizes + * @param barrierBeforeExit + */ + def setBarrierBeforeExit(barrierBeforeExit: Boolean): Unit = { + val flag: Int = if (barrierBeforeExit) 1 else 0 + checkCall(_LIB.mxKVStoreSetBarrierBeforeExit(handle, flag)) + } + /** * Send a command to all server nodes * diff --git a/scala-package/core/src/main/scala/ml/dmlc/mxnet/KVStoreServer.scala b/scala-package/core/src/main/scala/ml/dmlc/mxnet/KVStoreServer.scala index f2dfd6cd95bc..d3fe3e5040be 100644 --- a/scala-package/core/src/main/scala/ml/dmlc/mxnet/KVStoreServer.scala +++ b/scala-package/core/src/main/scala/ml/dmlc/mxnet/KVStoreServer.scala @@ -33,14 +33,60 @@ class KVStoreServer(private val kvStore: KVStore) { } object KVStoreServer { - // Start server/scheduler according to env variables - def start(): Unit = { + private val logger: Logger = LoggerFactory.getLogger(classOf[KVStoreServer]) + /** + * Start server/scheduler according to env variables + * @param dieIfOthersGoOutTimeout When this argument is set to an integer greater than 0 + * (in second), + * a daemon thread will start to periodically check + * whether scheduler (server side) or servers (scheduler side) + * are dead. If so, die itself. + * This could be useful for running mxnet on distributed + * data platform, + * where you do not know which node your application runs on + * and in such situation + * you want others die automatically once + * some of the nodes goes out. 
+ */ + def start(dieIfOthersGoOutTimeout: Int = 0): Unit = { val isWorker = new RefInt checkCall(_LIB.mxKVStoreIsWorkerNode(isWorker)) require(isWorker.value == 0, "cannot start kv-store server on worker node") val kvStore = KVStore.create("dist") + val daemonThread: Option[Thread] = + if (dieIfOthersGoOutTimeout > 0) { + val daemon = new Runnable { + override def run(): Unit = { + var running = true + while (running) { + try { + Thread.sleep(dieIfOthersGoOutTimeout.toLong * 1000) + val numDead = kvStore.numDeadNode(KVStore.GROUP_NODE_SCHEDULER + + KVStore.GROUP_NODE_SERVER + KVStore.GROUP_NODE_WORKER) + if (numDead > 0) { + logger.error(s"Detect $numDead dead node(s). Shutdown now.") + System.exit(1) + } + } catch { + case e: InterruptedException => running = false + } + } + } + } + val t = new Thread(daemon) + t.setDaemon(true) + t.start() + Option(t) + } else { + None + } val server = new KVStoreServer(kvStore) server.run() + daemonThread.foreach(t => { + t.interrupt() + t.join() + }) + kvStore.dispose() } def init(env: Map[String, String]): Unit = { diff --git a/scala-package/core/src/main/scala/ml/dmlc/mxnet/LibInfo.scala b/scala-package/core/src/main/scala/ml/dmlc/mxnet/LibInfo.scala index 98ce1953243d..92f37c99d309 100644 --- a/scala-package/core/src/main/scala/ml/dmlc/mxnet/LibInfo.scala +++ b/scala-package/core/src/main/scala/ml/dmlc/mxnet/LibInfo.scala @@ -78,6 +78,7 @@ class LibInfo { // KVStore Server @native def mxInitPSEnv(keys: Array[String], values: Array[String]): Int @native def mxKVStoreRunServer(handle: KVStoreHandle, controller: KVServerControllerCallback): Int + @native def mxKVStoreGetNumDeadNode(handle: KVStoreHandle, nodeId: Int, number: RefInt): Int // KVStore @native def mxKVStoreCreate(name: String, handle: KVStoreHandleRef): Int @@ -103,6 +104,7 @@ class LibInfo { @native def mxKVStoreBarrier(handle: KVStoreHandle): Int @native def mxKVStoreGetGroupSize(handle: KVStoreHandle, size: RefInt): Int @native def mxKVStoreGetRank(handle: 
KVStoreHandle, size: RefInt): Int + @native def mxKVStoreSetBarrierBeforeExit(handle: KVStoreHandle, doBarrier: Int): Int @native def mxKVStoreFree(handle: KVStoreHandle): Int // DataIter Funcs @@ -228,4 +230,28 @@ class LibInfo { @native def mxRandomSeed(seed: Int): Int @native def mxNotifyShutdown(): Int + + // RecordIO + @native def mxRecordIOWriterCreate(uri: String, out: RecordIOHandleRef): Int + @native def mxRecordIOReaderCreate(uri: String, out: RecordIOHandleRef): Int + @native def mxRecordIOWriterFree(handle: RecordIOHandle): Int + @native def mxRecordIOReaderFree(handle: RecordIOHandle): Int + @native def mxRecordIOWriterWriteRecord(handle: RecordIOHandle, buf: String, size: Int): Int + @native def mxRecordIOReaderReadRecord(handle: RecordIOHandle, buf: RefString): Int + @native def mxRecordIOWriterTell(handle: RecordIOHandle, pos: RefInt): Int + @native def mxRecordIOReaderSeek(handle: RecordIOHandle, pos: Int): Int + + @native def mxOptimizerFindCreator(key: String, out: OptimizerCreatorRef): Int + @native def mxOptimizerCreateOptimizer(creator: OptimizerCreator, + numParam: Int, + keys: Array[String], + vals: Array[String], + out: OptimizerHandleRef): Int + @native def mxOptimizerFree(handle: OptimizerHandle): Int + @native def mxOptimizerUpdate(handle: OptimizerHandle, + index: Int, + weight: NDArrayHandle, + grad: NDArrayHandle, + lr: Float, + wd: Float): Int } diff --git a/scala-package/core/src/main/scala/ml/dmlc/mxnet/RecordIO.scala b/scala-package/core/src/main/scala/ml/dmlc/mxnet/RecordIO.scala new file mode 100644 index 000000000000..e749c8413744 --- /dev/null +++ b/scala-package/core/src/main/scala/ml/dmlc/mxnet/RecordIO.scala @@ -0,0 +1,214 @@ +package ml.dmlc.mxnet + +import ml.dmlc.mxnet.Base._ +import java.io.File +import scala.io.Source +import java.io.PrintWriter +import java.io.ByteArrayOutputStream +import java.io.DataOutputStream +import java.io.DataInputStream +import java.io.ByteArrayInputStream + +/** + * Scala interface for 
read/write RecordIO data format + * + * @author Depeng Liang + * + * @param uri, path to recordIO file. + * @param flag, RecordIO.IORead for reading or RecordIO.Write for writing. + */ +class MXRecordIO(uri: String, flag: MXRecordIO.IOFlag) { + protected val recordIOHandle: RecordIOHandleRef = new RecordIOHandleRef + protected var isOpen: Boolean = false + + open() + + // Open record file + protected def open(): Unit = { + flag match { + case MXRecordIO.IOWrite => { + checkCall(_LIB.mxRecordIOWriterCreate(uri, recordIOHandle)) + } + case MXRecordIO.IORead => { + checkCall(_LIB.mxRecordIOReaderCreate(uri, recordIOHandle)) + } + } + this.isOpen = true + } + + // Close record file + def close(): Unit = { + if (this.isOpen) { + flag match { + case MXRecordIO.IOWrite => { + checkCall(_LIB.mxRecordIOWriterFree(recordIOHandle.value)) + } + case MXRecordIO.IORead => { + checkCall(_LIB.mxRecordIOReaderFree(recordIOHandle.value)) + } + } + } + } + + // Reset pointer to first item. + // If record is opened with RecordIO.IOWrite, this will truncate the file to empty. + def reset(): Unit = { + this.close() + this.open() + } + + // Write a string buffer as a record + def write(buf: String): Unit = { + assert(this.flag == MXRecordIO.IOWrite) + checkCall(_LIB.mxRecordIOWriterWriteRecord(this.recordIOHandle.value, buf, buf.size)) + } + + // Read a record as string + def read(): String = { + assert(this.flag == MXRecordIO.IORead) + val result = new RefString + checkCall(_LIB.mxRecordIOReaderReadRecord(this.recordIOHandle.value, result)) + result.value + } +} + +object MXRecordIO { + sealed trait IOFlag + case object IOWrite extends IOFlag + case object IORead extends IOFlag + + case class IRHeader(flag: Int, label: Array[Float], id: Int, id2: Int) + + /** + * pack an string into MXImageRecord. + * @param + * header of the image record. + * header.label an array. 
+ * @param s string to pack + * @return the resulting packed string + */ + def pack(header: IRHeader, s: String): String = { + val data = new ByteArrayOutputStream() + val stream = new DataOutputStream(data) + stream.writeInt(header.label.length) + header.label.foreach(stream.writeFloat) + stream.writeInt(header.id) + stream.writeInt(header.id2) + stream.writeUTF(s) + stream.flush() + stream.close() + data.toByteArray().map(_.toChar).mkString + } + + /** + * unpack a MXImageRecord to string. + * @param s string buffer from MXRecordIO.read + * @return + * header : IRHeader, header of the image record + * str : String, unpacked string + */ + def unpack(s: String): (IRHeader, String) = { + val data = s.toCharArray().map(_.toByte) + val stream = new DataInputStream(new ByteArrayInputStream(data)) + val flag = stream.readInt() + val label = (0 until flag).map( idx => stream.readFloat()).toArray + val id = stream.readInt() + val id2 = stream.readInt() + val str = stream.readUTF() + stream.close() + (IRHeader(flag, label, id, id2), str) + } + +} + +/** + * Scala interface for read/write RecordIO data formmat with index. + * Support random access. + * + * @author Depeng Liang + * + * @param idx_path, path to index file + * @param uri, path to recordIO file. + * @param flag, RecordIO.IORead for reading or RecordIO.Write for writing. + * @param keyType, data type for keys. 
+ */ +class MXIndexedRecordIO(idxPath: String, uri: String, flag: MXRecordIO.IOFlag, + keyType: MXIndexedRecordIO.KeyType = MXIndexedRecordIO.TyepInt) extends MXRecordIO(uri, flag) { + private var idx = this.keyType match { + case MXIndexedRecordIO.TyepInt => Map[Int, Int]() + case _ => Map[Any, Int]() + } + + if (flag == MXRecordIO.IORead && new File(idxPath).isFile()) { + Source.fromFile(idxPath).getLines().foreach { line => + val (k, v) = { + val tmp = line.trim().split("\t") + val key = this.keyType match { + case MXIndexedRecordIO.TyepInt => tmp(0).toInt + } + (key, tmp(1).toInt) + } + this.idx = this.idx + (k -> v) + } + } + + override def close(): Unit = { + if (this.flag == MXRecordIO.IOWrite) { + val fOut = new PrintWriter(idxPath) + this.idx.foreach { case (k, v) => + fOut.write(s"$k\t$v\n") + } + fOut.flush() + fOut.close() + } + super.close() + } + + override def reset(): Unit = { + this.idx = Map[Any, Int]() + super.close() + super.open() + } + + // Query current read head position + def seek(idx: Any): Unit = { + assert(this.flag == MXRecordIO.IORead) + val idxx = this.keyType match { + case MXIndexedRecordIO.TyepInt => idx.asInstanceOf[Int] + } + val pos = this.idx(idxx) + checkCall(_LIB.mxRecordIOReaderSeek(this.recordIOHandle.value, pos)) + } + + // Query current write head position + def tell(): Int = { + assert(this.flag == MXRecordIO.IOWrite) + val pos = new RefInt + checkCall(_LIB.mxRecordIOWriterTell(this.recordIOHandle.value, pos)) + pos.value + } + + // Read record with index + def readIdx(idx: Any): String = { + this.seek(idx) + this.read() + } + + // Write record with index + def writeIdx(idx: Any, buf: String): Unit = { + val pos = this.tell() + val idxx = this.keyType match { + case MXIndexedRecordIO.TyepInt => idx.asInstanceOf[Int] + } + this.idx = this.idx + (idxx -> pos) + this.write(buf) + } + + // List all keys from index + def keys(): Iterable[Any] = this.idx.keys +} + +object MXIndexedRecordIO { + sealed trait KeyType + case 
object TyepInt extends KeyType +} diff --git a/scala-package/core/src/main/scala/ml/dmlc/mxnet/Symbol.scala b/scala-package/core/src/main/scala/ml/dmlc/mxnet/Symbol.scala index e9520b453d68..cb66492b4980 100644 --- a/scala-package/core/src/main/scala/ml/dmlc/mxnet/Symbol.scala +++ b/scala-package/core/src/main/scala/ml/dmlc/mxnet/Symbol.scala @@ -765,16 +765,15 @@ class Symbol private(private[mxnet] val handle: SymbolHandle) { jsonStr.value } } -// scalastyle:on finalize +// scalastyle:on finalize +@AddSymbolFunctions object Symbol { private type SymbolCreateNamedFunc = Map[String, Any] => Symbol private val logger = LoggerFactory.getLogger(classOf[Symbol]) private val functions: Map[String, SymbolFunction] = initSymbolModule() private val bindReqMap = Map("null" -> 0, "write" -> 1, "add" -> 3) - // TODO: _CrossDeviceCopy - def pow(sym1: Symbol, sym2: Symbol): Symbol = { Symbol.createFromListedSymbols("_Power")(Array(sym1, sym2)) } @@ -787,118 +786,6 @@ object Symbol { Symbol.createFromListedSymbols("_RPowerScalar")(Array(sym), Map("scalar" -> number.toString)) } - /** - * Take absolute value of the src - * @param src Source symbolic input to the function - */ - def abs(src: Symbol): Symbol = { - createFromListedSymbols("abs")(Array(src)) - } - - /** - * Take sign value of the src - * @param src Source symbolic input to the function - */ - def sign(src: Symbol): Symbol = { - createFromListedSymbols("sign")(Array(src)) - } - - /** - * Take round value of the src - * @param src Source input to the function - */ - def round(src: Symbol): Symbol = { - createFromListedSymbols("round")(Array(src)) - } - - /** - * Take ceil value of the src - * src Source input to the function - */ - def ceil(src: Symbol): Symbol = { - createFromListedSymbols("ceil")(Array(src)) - } - - /** - * Take floor value of the src - * @param src Source input to the function - */ - def floor(src: Symbol): Symbol = { - createFromListedSymbols("floor")(Array(src)) - } - - /** - * Take square of the 
src - * @param src Source symbolic input to the function - */ - def square(src: Symbol): Symbol = { - createFromListedSymbols("square")(Array(src)) - } - - /** - * Take sum of the src - * @param src Source symbolic input to the function - */ - def sum(src: Symbol): Symbol = { - createFromListedSymbols("sum")(Array(src)) - } - - /** - * Take sqrt of the src - * src Source symbolic input to the function - */ - def sqrt(src: Symbol): Symbol = { - createFromListedSymbols("sqrt")(Array(src)) - } - - /** - * Take rsqrt of the src - * @param src Source symbolic input to the function - */ - def rsqrt(src: Symbol): Symbol = { - createFromListedSymbols("rsqrt")(Array(src)) - } - - /** - * Take exp of the src - * @param src Source symbolic input to the function - */ - def exp(src: Symbol): Symbol = { - createFromListedSymbols("exp")(Array(src)) - } - - /** - * Take log of the src - * @param src Source symbolic input to the function - */ - def log(src: Symbol): Symbol = { - createFromListedSymbols("log")(Array(src)) - } - - /** - * Take cos of the src - * @param src Source symbolic input to the function - */ - def cos(src: Symbol): Symbol = { - createFromListedSymbols("cos")(Array(src)) - } - - /** - * Take sin of the src - * @param src Source symbolic input to the function - */ - def sin(src: Symbol): Symbol = { - createFromListedSymbols("sin")(Array(src)) - } - - /** - * Return transpose of the src - * @param src Source symbolic input to the function - */ - def transpose(src: Symbol): Symbol = { - createFromListedSymbols("transpose")(Array(src)) - } - def max(left: Symbol, right: Symbol): Symbol = { createFromListedSymbols("_Maximum")(Array(left, right)) } @@ -937,422 +824,6 @@ object Symbol { sym } - /** - * Get output from a symbol and pass 0 gradient back - * - * Parameters - * ---------- - * data : Symbol. Input data. 
- */ - def BlockGrad(name: String = null, attr: Map[String, String] = null): SymbolCreateNamedFunc = { - createFromNamedSymbolsNoCheck("BlockGrad", name, attr) - } - - /** - * Crop the 2th and 3th dim of input data, with the corresponding size of w_h or with width - * and height of the second input symbol - * - * Parameters - * ---------- - * num_args : int, required. - * Number of inputs for crop, - * if equals one, then we will use the h_w for crop height and width, - * else if equals two, - * then we will use the height and width of the second input symbol, - * we name crop_like here - * offset : Shape(tuple), optional, default=(0, 0), corp offset coordinate: (y, x) - * h_w : Shape(tuple), optional, default=(0, 0), corp height and weight: (h, w) - * center_crop : boolean, optional, default=False. - * If set to true, then it will use be the center_crop, - * or it will crop using the shape of crop_like - */ - def Crop(name: String = null, attr: Map[String, String] = null)( - inputs: Array[Symbol], params: Map[String, Any] = null): Symbol = { - createFromListedSymbolsNoCheck("Crop", name, attr)(inputs, params) - } - - /** - * Apply dropout to input - * - * Parameters - * ---------- - * data : Symbol. Input data to dropout. - * p : float, optional, default=0.5. Fraction of the input that gets dropped out at training time - */ - def Dropout(name: String = null, attr: Map[String, String] = null): SymbolCreateNamedFunc = { - createFromNamedSymbolsNoCheck("Dropout", name, attr) - } - - /** - * Apply a sparse regularization to the output a sigmoid activation function. - * - * Parameters - * ---------- - * data : Symbol. Input data. - * sparseness_target : float, optional, default=0.1. The sparseness target - * penalty : float, optional, default=0.001. The tradeoff parameter for the sparseness penalty - * momentum : float, optional, default=0.9. 
The momentum for running average - */ - def IdentityAttachKLSparseReg(name: String = null, - attr: Map[String, String] = null): SymbolCreateNamedFunc = { - createFromNamedSymbolsNoCheck("IdentityAttachKLSparseReg", name, attr) - } - - /** - * Apply activation function to input. - * - * Parameters - * ---------- - * data : Symbol. Input data to activation function. - * act_type : {'elu', 'leaky', 'prelu', 'rrelu'},optional, default='leaky' - * Activation function to be applied. - * slope : float, optional, default=0.25. Init slope for the activation. (For leaky and elu only) - * lower_bound : float, optional, default=0.125. Lower bound of random slope. (For rrelu only) - * upper_bound : float, optional, default=0.334. Upper bound of random slope. (For rrelu only) - */ - def LeakyReLU(name: String = null, attr: Map[String, String] = null): SymbolCreateNamedFunc = { - createFromNamedSymbolsNoCheck("LeakyReLU", name, attr) - } - - /** - * Apply convolution to input then add a bias. - * - * Parameters - * ---------- - * data : Symbol. Input data to the ConvolutionOp. - * alpha : float, optional, default=0.0001, - * value of the alpha variance scaling parameter in the normalization formula - * beta : float, optional, default=0.75, - * value of the beta power parameter in the normalization formula - * knorm : float, optional, default=2, value of the k parameter in normalization formula - * nsize : int (non-negative), required, normalization window width in elements. - */ - def LRN(name: String = null, attr: Map[String, String] = null): SymbolCreateNamedFunc = { - createFromNamedSymbolsNoCheck("LRN", name, attr) - } - - /** - * Use mean absolute error regression for final output, this is used on final output of a net. - * - * Parameters - * ---------- - * data : Symbol. Input data to function. - * label : Symbol. Input label to function. - * grad_scale : float, optional, default=1. 
Scale the gradient by a float factor - */ - def MAERegressionOutput(name: String = null, - attr: Map[String, String] = null): SymbolCreateNamedFunc = { - createFromNamedSymbolsNoCheck("MAERegressionOutput", name, attr) - } - - /** - * Reshape input to target shape - * - * Parameters - * ---------- - * data : Symbol. Input data to reshape. - * target_shape : Shape(tuple), required. Target new shape. One and only one dim can be 0, - * in which case it will be infered from the rest of dims - */ - def Reshape(name: String = null, attr: Map[String, String] = null): SymbolCreateNamedFunc = { - createFromNamedSymbolsNoCheck("Reshape", name, attr) - } - - /** - * Slice channel into many outputs with equally divided channel - * - * Parameters - * ---------- - * num_outputs : int, required. Number of outputs to be sliced. - */ - def SliceChannel(name: String = null, attr: Map[String, String] = null)( - inputs: Array[Symbol], params: Map[String, Any] = null): Symbol = { - createFromListedSymbolsNoCheck("SliceChannel", name, attr)(inputs, params) - } - - /** - * Apply softmax activation to input. - * This is intended for internal layers. For output (loss layer) please use SoftmaxOutput. - * If type=instance, - * this operator will compute a softmax for each instance in the batch; this is the default mode. - * If type=channel, - * this operator will compute a num_channel-class softmax at each position of each instance; - * this can be used for fully convolutional network, image segmentation, etc. - * - * Parameters - * ---------- - * data : Symbol. Input data to activation function. - * type : {'channel', 'instance'},optional, default='instance'. Softmax Mode. - * If set to instance, - * this operator will compute a softmax for each instance in the batch; - * this is the default mode. 
- * If set to channel, - * this operator will compute a num_channel-class softmax - * at each position of each instance; - * this can be used for fully convolutional network, image segmentation, etc. - */ - def SoftmaxActivation(name: String = null, - attr: Map[String, String] = null): SymbolCreateNamedFunc = { - createFromNamedSymbolsNoCheck("SoftmaxActivation", name, attr) - } - - /** - * Apply matrix multiplication to input then add a bias. - * - * Parameters - * ---------- - * data : Symbol. Input data to the FullyConnectedOp. - * weight : Symbol. Weight matrix. - * bias : Symbol. Bias parameter. - * num_hidden : int, required. Number of hidden nodes of the output. - * no_bias : boolean, optional, default=False. Whether to disable bias parameter. - */ - def FullyConnected(name: String = null, - attr: Map[String, String] = null): SymbolCreateNamedFunc = { - createFromNamedSymbolsNoCheck("FullyConnected", name, attr) - } - - /** - * Apply activation function to input. - * Softmax Activation is only available with CUDNN on GPUand will be computed - * at each location across channel if input is 4D. - * - * Parameters - * ---------- - * data : Symbol. Input data to activation function. - * act_type : {'relu', 'sigmoid', 'softrelu', 'tanh'}, required. - * Activation function to be applied. - */ - def Activation(name: String = null, attr: Map[String, String] = null): SymbolCreateNamedFunc = { - createFromNamedSymbolsNoCheck("Activation", name, attr) - } - - /** - * Apply convolution to input then add a bias. - * - * Parameters - * ---------- - * data : Symbol. Input data to the ConvolutionOp. - * weight : Symbol. Weight matrix. - * bias : Symbol. Bias parameter. - * kernel : Shape(tuple), required. Convolution kernel size: (y, x) - * stride : Shape(tuple), optional, default=(1, 1). Convolution stride: (y, x) - * dilate : Shape(tuple), optional, default=(1, 1). Convolution dilate: (y, x) - * pad : Shape(tuple), optional, default=(0, 0). 
Pad for convolution: (y, x) - * num_filter : int (non-negative), required. Convolution filter(channel) number - * num_group : int (non-negative), optional, default=1 - * Number of groups partition. - * This option is not supported by CuDNN, - * you can use SliceChannel to num_group, - * apply convolution and concat instead to achieve the same need. - * workspace : long (non-negative), optional, default=512. Tmp workspace for convolution (MB). - * no_bias : boolean, optional, default=False. Whether to disable bias parameter. - */ - def Convolution(name: String = null, attr: Map[String, String] = null): SymbolCreateNamedFunc = { - createFromNamedSymbolsNoCheck("Convolution", name, attr) - } - - /** - * Apply deconvolution to input then add a bias. - * - * Parameters - * ---------- - * data : Symbol. Input data to the DeconvolutionOp. - * weight : Symbol. Weight matrix. - * bias : Symbol. Bias parameter. - * kernel : Shape(tuple), required, deconvolution kernel size: (y, x) - * stride : Shape(tuple), optional, default=(1, 1), deconvolution stride: (y, x) - * pad : Shape(tuple), optional, default=(0, 0), pad for deconvolution: (y, x) - * num_filter : int (non-negative), required, deconvolution filter(channel) number - * num_group : int (non-negative), optional, default=1, number of groups partition - * workspace : long (non-negative), optional, default=512. Tmp workspace for deconvolution (MB) - * no_bias : boolean, optional, default=True. Whether to disable bias parameter. - */ - def Deconvolution(name: String = null, - attr: Map[String, String] = null): SymbolCreateNamedFunc = { - createFromNamedSymbolsNoCheck("Deconvolution", name, attr) - } - - /** - * Perform spatial pooling on inputs. - * - * Parameters - * ---------- - * data : Symbol. Input data to the pooling operator. - * kernel : Shape(tuple), required, pooling kernel size: (y, x) - * pool_type : {'avg', 'max', 'sum'}, required. Pooling type to be applied. 
- * stride : Shape(tuple), optional, default=(1, 1), stride for pooling (y, x) - * pad : Shape(tuple), optional, default=(0, 0), pad for pooling: (y, x) - */ - def Pooling(name: String = null, attr: Map[String, String] = null): SymbolCreateNamedFunc = { - createFromNamedSymbolsNoCheck("Pooling", name, attr) - } - - /** - * Flatten input - * Parameters - * ---------- - * data : Symbol. Input data to flatten. - */ - def Flatten(name: String = null, attr: Map[String, String] = null): SymbolCreateNamedFunc = { - createFromNamedSymbolsNoCheck("Flatten", name, attr) - } - - /** - * Perform a softmax transformation on input, backprop with logloss. - * - * Parameters - * ---------- - * data : Symbol. Input data to softmax. - * label : Symbol. Label data. - * grad_scale : float, optional, default=1. Scale the gradient by a float factor - * ignore_label : float, optional, default=-1. - * the ignore_label will not work in backward, - * and this onlybe used when multi_output=true - * multi_output : boolean, optional, default=False. - * If set to true, for a (n,k,x_1,..,x_n) dimensionalinput tensor, - * softmax will generate n*x_1*...*x_n output, eachhas k classes - * use_ignore : boolean, optional, default=False. - * If set to true, - * the ignore_label value will not contributorto the backward gradient - */ - def SoftmaxOutput(name: String = null, - attr: Map[String, String] = null): SymbolCreateNamedFunc = { - createFromNamedSymbolsNoCheck("SoftmaxOutput", name, attr) - } - - /** - * Cast array to a different data type. - * Parameters - * ---------- - * data : Symbol, Input data to cast function. - * dtype : {Int, Double, Short, Float}, required, Target data type. - */ - def Cast(name: String = null, attr: Map[String, String] = null): SymbolCreateNamedFunc = { - createFromNamedSymbolsNoCheck("Cast", name, attr) - } - - /** - * Perform an elementwise sum over all the inputs. - * - * Parameters - * ---------- - * num_args : int, required. Number of inputs to be sum. 
- */ - def ElementWiseSum(name: String = null, - attr: Map[String, String] = null)( - symbols: Array[Symbol], params: Map[String, Any] = null): Symbol = { - createFromListedSymbolsNoCheck("ElementWiseSum", name, attr)(symbols, params) - } - - /** - * Apply batch normalization to input. - * - * Parameters - * ---------- - * data : Symbol, Input data to batch normalization - * eps : float, optional, default=0.001, Epsilon to prevent div 0 - * momentum : float, optional, default=0.9, Momentum for moving average - * fix_gamma : boolean, optional, default=True, Fix gamma while training - */ - def BatchNorm(name: String = null, attr: Map[String, String] = null): SymbolCreateNamedFunc = { - createFromNamedSymbolsNoCheck("BatchNorm", name, attr) - } - - /** - * Perform nearest neighbor/bilinear up sampling to inputs - * - * Parameters - * ---------- - * data : Symbol[]. Array of tensors to upsample - * scale : int (non-negative), required. Up sampling scale - * num_filter : int (non-negative), optional, default=0. - * Input filter. Only used by nearest sample_type. - * sample_type : {'bilinear', 'nearest'}, required, upsampling method - * multi_input_mode : {'concat', 'sum'},optional, default='concat' - * How to handle multiple input. - * concat means concatenate upsampled images along the channel dimension. - * sum means add all images together, - * only available for nearest neighbor upsampling. - * num_args : int, required. Number of inputs to be upsampled. - * For nearest neighbor upsampling, this can be 1-N; - * the size of output will be(scale*h_0,scale*w_0) - * and all other inputs will be upsampled to thesame size. - * For bilinear upsampling this must be 2; 1 input and 1 weight. 
- */ - def UpSampling(name: String = null, attr: Map[String, String] = null)( - inputs: Array[Symbol], params: Map[String, Any] = null): Symbol = { - createFromListedSymbolsNoCheck("UpSampling", name, attr)(inputs, params) - } - - /** - * Perform an feature concat on channel dim (dim 1) over all the inputs. - * - * Parameters - * ---------- - * data : Symbol[]. List of tensors to concatenate - * num_args : int, required. Number of inputs to be concated. - * dim : int, optional, default='1'. the dimension to be concated. - */ - def Concat(name: String = null, attr: Map[String, String] = null)( - inputs: Array[Symbol], params: Map[String, Any] = null): Symbol = { - createFromListedSymbolsNoCheck("Concat", name, attr)(inputs, params) - } - - /** - * Use Logistic regression for final output, this is used on final output of a net. - * Logistic regression is suitable for binary classification or probability prediction tasks. - * Parameters - * ---------- - * data : Symbol. Input data to function. - * label : Symbol. Input label to function. - * grad_scale : float, optional, default=1. Scale the gradient by a float factor - */ - def LogisticRegressionOutput(name: String = null, - attr: Map[String, String] = null): SymbolCreateNamedFunc = { - createFromNamedSymbolsNoCheck("LogisticRegressionOutput", name, attr) - } - - /** - * Use linear regression for final output, this is used on final output of a net. - * Parameters - * ---------- - * data : Symbol. Input data to function. - * label : Symbol. Input label to function. - * grad_scale : float, optional, default=1. Scale the gradient by a float factor - */ - def LinearRegressionOutput(name: String = null, - attr: Map[String, String] = null): SymbolCreateNamedFunc = { - createFromNamedSymbolsNoCheck("LinearRegressionOutput", name, attr) - } - - /** - * Apply swapaxis to input. - * - * Parameters - * ---------- - * data : Symbol. Input data to the SwapAxisOp. 
- * dim1 : int (non-negative), default=0, the first axis to be swapped. - * dim2 : int (non-negative), default=0, the second axis to be swapped. - */ - def SwapAxis(name: String = null, attr: Map[String, String] = null): SymbolCreateNamedFunc = { - createFromNamedSymbolsNoCheck("SwapAxis", name, attr) - } - - /** - * Get embedding for one-hot input - * - * Parameters - * ---------- - * data : Symbol, Input data to the EmbeddingOp. - * weight : Symbol, Embedding weight matrix. - * input_dim : int, input dim of one-hot encoding - * output_dim : int, output dim of embedding - */ - def Embedding(name: String = null, attr: Map[String, String] = null): SymbolCreateNamedFunc = { - createFromNamedSymbolsNoCheck("Embedding", name, attr) - } - /** * Create a symbol that groups symbols together. * @param symbols List of symbols to be grouped. @@ -1390,6 +861,39 @@ object Symbol { (name.value, new SymbolFunction(handle, keyVarNumArgs.value)) } + // Used by SymbolMacro + private def createSymbolGeneral(operator: String, name: String, attr: Map[String, String], + symbols: Seq[Symbol], kwargs: Map[String, Any]): Symbol = { + val symbolKwargs: Map[String, Symbol] = + if (kwargs == null || kwargs.isEmpty) { + Map.empty[String, Symbol] + } else { + kwargs.filter { case (key, value) => + value.isInstanceOf[Symbol] + }.map { case (key, value) => + (key, value.asInstanceOf[Symbol]) + } + } + val strKwargs: Map[String, String] = + if (kwargs == null || kwargs.isEmpty) { + Map.empty[String, String] + } else { + kwargs.filter { case (key, value) => + !value.isInstanceOf[Symbol] + }.map { case (key, value) => + (key, value.toString) + } + } + require(symbols.isEmpty || symbolKwargs.isEmpty, String.format( + "%s can only accept input Symbols either as positional or keyword arguments, not both", + operator)) + if (symbols.isEmpty) { + createFromNamedSymbols(operator, name, attr)(symbolKwargs, strKwargs) + } else { + createFromListedSymbols(operator, name, attr)(symbols.toArray, strKwargs) + 
} + } + /** * Activation Operator of Neural Net. * The parameters listed below can be passed in as keyword arguments. @@ -1446,7 +950,7 @@ object Symbol { val function = functions(operator) require(function != null, s"invalid operator name $operator") require(function.keyVarNumArgs == null || function.keyVarNumArgs.isEmpty, - "This function support variable length of Symbol arguments.\n" + + s"[$operator] support variable length of Symbol arguments.\n" + "Please pass all the input Symbols via positional arguments instead of keyword arguments.") val paramKeys = @@ -1470,6 +974,7 @@ object Symbol { // a more friendly interface for creating symbols // all values except symbols in kwargs will be cast to String using its toString() method + @Deprecated def createFromNamedSymbolsNoCheck( operator: String, name: String = null, attr: Map[String, String] = null)( kwargs: Map[String, Any]): Symbol = { @@ -1488,9 +993,10 @@ object Symbol { // a more friendly interface for creating symbols // all values except symbols in kwargs will be cast to String using its toString() method + @Deprecated def createFromListedSymbolsNoCheck( - operator: String, name: String = null, attr: Map[String, String] = null)( - symbols: Array[Symbol], kwargs: Map[String, Any] = null): Symbol = { + operator: String, name: String = null, attr: Map[String, String] = null)( + symbols: Array[Symbol], kwargs: Map[String, Any] = null): Symbol = { val args = if (kwargs == null) null else kwargs.map { case (key, value) => (key, value.toString) } diff --git a/scala-package/core/src/main/scala/ml/dmlc/mxnet/Visualization.scala b/scala-package/core/src/main/scala/ml/dmlc/mxnet/Visualization.scala new file mode 100644 index 000000000000..aab7f47f4854 --- /dev/null +++ b/scala-package/core/src/main/scala/ml/dmlc/mxnet/Visualization.scala @@ -0,0 +1,282 @@ +package ml.dmlc.mxnet + +import scala.util.parsing.json._ +import java.io.File +import java.io.PrintWriter +import scala.collection.mutable.ArrayBuffer +import 
ml.dmlc.mxnet.Symbol +import ml.dmlc.mxnet.Shape + +/** + * @author Depeng Liang + */ +object Visualization { + + /** + * A simplify implementation of the python-Graphviz library functionality + * based on: https://github.com/xflr6/graphviz/tree/master/graphviz + */ + class Dot(name: String) { + // http://www.graphviz.org/cgi-bin/man?dot + private val ENGINES = Set( + "dot", "neato", "twopi", "circo", "fdp", "sfdp", "patchwork", "osage" + ) + + // http://www.graphviz.org/doc/info/output.html + private val FORMATS = Set( + "bmp", + "canon", "dot", "gv", "xdot", "xdot1.2", "xdot1.4", + "cgimage", + "cmap", + "eps", + "exr", + "fig", + "gd", "gd2", + "gif", + "gtk", + "ico", + "imap", "cmapx", + "imap_np", "cmapx_np", + "ismap", + "jp2", + "jpg", "jpeg", "jpe", + "pct", "pict", + "pdf", + "pic", + "plain", "plain-ext", + "png", + "pov", + "ps", + "ps2", + "psd", + "sgi", + "svg", "svgz", + "tga", + "tif", "tiff", + "tk", + "vml", "vmlz", + "vrml", + "wbmp", + "webp", + "xlib", + "x11" + ) + + private val _head = "digraph %s{".format(name) + private val _node = "\t%s %s" + private val _edge = "\t\t%s -> %s %s" + private val _tail = "}" + private val _body = ArrayBuffer[String]() + + private def attribute(label: String = null, attrs: Map[String, String]): String = { + if (label != null) { + s"[label=$label ${("" /: attrs){ (acc, elem) => s"$acc ${elem._1}=${elem._2}"}}]" + } + else { + s"[${("" /: attrs){ (acc, elem) => s"$acc ${elem._1}=${elem._2}"}}]" + } + } + + /** + * Create a node. + * @param name Unique identifier for the node inside the source. + * @param label Caption to be displayed (defaults to the node name). + * @param attrs Any additional node attributes (must be strings). + */ + def node(name: String, label: String = null, attrs: Map[String, String]): Unit = { + _body += _node.format(name, attribute(label, attrs)) + } + + /** + * Create an edge between two nodes. + * @param tailName Start node identifier. + * @param headName End node identifier. 
+ * @param label Caption to be displayed near the edge. + * @param attrs Any additional edge attributes (must be strings). + */ + def edge(tailName: String, headName: String, + label: String = null, attrs: Map[String, String]): Unit = { + _body += _edge.format(tailName, headName, attribute(label, attrs)) + } + + private def save(filename: String, directory: String): String = { + val path = s"$directory${File.separator}$filename" + val writer = new PrintWriter(path) + try { + // scalastyle:off println + writer.println(s"${this._head}") + this._body.toArray.foreach { line => writer.println(s"$line") } + writer.println(s"${this._tail}") + writer.flush() + // scalastyle:off println + } finally { + writer.close() + } + path + } + + private def command(engine: String, format: String, filepath: String): String = { + require(ENGINES.contains(engine) == true, s"unknown engine: $engine") + require(FORMATS.contains(format) == true, s"unknown format: $format") + s"$engine -T${format} -O $filepath" + } + + /** + * Render file with Graphviz engine into format. + * @param engine The layout commmand used for rendering ('dot', 'neato', ...). + * @param format The output format used for rendering ('pdf', 'png', ...). + * @param fileName Name of the DOT source file to render. + * @param path Path to save the Dot source file. + */ + def render(engine: String = "dot", format: String = "pdf", + fileName: String, path: String): Unit = { + val filePath = this.save(fileName, path) + val args = command(engine, format, filePath) + import sys.process._ + try { + args ! 
+ } catch { case _ : Throwable => + val errorMsg = s"""failed to execute "$args", """ + + """"make sure the Graphviz executables are on your systems' path""" + throw new RuntimeException(errorMsg) + } + } + } + + /** + * convert shape string to list, internal use only + * @param str shape string + * @return list of string to represent shape + */ + def str2Tuple(str: String): List[String] = { + val re = """\d+""".r + re.findAllIn(str).toList + } + + /** + * convert symbol to Dot object for visualization + * @param symbol symbol to be visualized + * @param title title of the dot graph + * @param shape Map of shapes, str -> shape, given input shapes + * @param nodeAttrs Map of node's attributes + * for example: + * nodeAttrs = Map("shape" -> "oval", "fixedsize" -> "fasle") + * means to plot the network in "oval" + * @return Dot object of symbol + */ + def plotNetwork(symbol: Symbol, + title: String = "plot", shape: Map[String, Shape] = null, + nodeAttrs: Map[String, String] = Map[String, String]()): Dot = { + + val (drawShape, shapeDict) = { + if (shape == null) (false, null) + else { + val internals = symbol.getInternals() + val (_, outShapes, _) = internals.inferShape(shape) + require(outShapes != null, "Input shape is incompete") + val shapeDict = internals.listOutputs().zip(outShapes).toMap + (true, shapeDict) + } + } + val conf = JSON.parseFull(symbol.toJson) match { + case None => null + case Some(map) => map.asInstanceOf[Map[String, Any]] + } + require(conf != null) + + require(conf.contains("nodes")) + val nodes = conf("nodes").asInstanceOf[List[Any]] + + require(conf.contains("heads")) + val heads = { + val headsList = conf("heads").asInstanceOf[List[List[Int]]] + require(headsList.length > 0) + headsList(0).toSet + } + + // default attributes of node + val nodeAttr = scala.collection.mutable.Map("shape" -> "box", "fixedsize" -> "true", + "width" -> "1.3", "height" -> "0.8034", "style" -> "filled") + // merge the dict provided by user and the default one + 
nodeAttrs.foreach { case (k, v) => nodeAttr(k) = v } + val dot = new Dot(name = title) + // color map + val cm = List(""""#8dd3c7"""", """"#fb8072"""", """"#ffffb3"""", + """"#bebada"""", """"#80b1d3"""", """"#fdb462"""", + """"#b3de69"""", """"#fccde5"""") + + // make nodes + nodes.zipWithIndex.foreach { case (node, i) => + val params = node.asInstanceOf[Map[String, Any]] + val op = params("op").asInstanceOf[String] + val name = params("name").asInstanceOf[String] + val param = params("param").asInstanceOf[Map[String, String]] + // input data + val attr = nodeAttr.clone() + var label = op + var continue = false + op match { + case "null" => if (heads.contains(i)) { + label = name + attr("fillcolor") = cm(0) + } else continue = true + case "Convolution" => { + val kernel = str2Tuple(param("kernel")) + val stride = str2Tuple(param("stride")) + label = + s""""Convolution\\n${kernel(0)}x${kernel(1)}/${stride(0)}, ${param("num_filter")}"""" + attr("fillcolor") = cm(1) + } + case "FullyConnected" => { + label = s""""FullyConnected\\n${param("num_hidden")}"""" + attr("fillcolor") = cm(1) + } + case "BatchNorm" => attr("fillcolor") = cm(3) + case "Activation" | "LeakyReLU" => { + label = s""""${op}\\n${param("act_type")}"""" + attr("fillcolor") = cm(2) + } + case "Pooling" => { + val kernel = str2Tuple(param("kernel")) + val stride = str2Tuple(param("stride")) + label = + s""""Pooling\\n${param("pool_type")}, ${kernel(0)}x${kernel(1)}/${stride(0)}"""" + attr("fillcolor") = cm(4) + } + case "Concat" | "Flatten" | "Reshape" => attr("fillcolor") = cm(5) + case "Softmax" => attr("fillcolor") = cm(6) + case _ => attr("fillcolor") = cm(7) + } + if (!continue) dot.node(name = name , label, attr.toMap) + } + + // add edges + nodes.zipWithIndex.foreach { case (node, i) => + val params = node.asInstanceOf[Map[String, Any]] + val op = params("op").asInstanceOf[String] + val name = params("name").asInstanceOf[String] + if (op != "null") { + val inputs = 
params("inputs").asInstanceOf[List[List[Double]]] + for (item <- inputs) { + val inputNode = nodes(item(0).toInt).asInstanceOf[Map[String, Any]] + val inputName = inputNode("name").asInstanceOf[String] + if (inputNode("op").asInstanceOf[String] != "null" || heads.contains(item(0).toInt)) { + val attrs = scala.collection.mutable.Map("dir" -> "back", "arrowtail" -> "open") + // add shapes + if (drawShape) { + val key = { + if (inputNode("op").asInstanceOf[String] != "null") s"${inputName}_output" + else inputName + } + val shape = shapeDict(key).toArray.drop(1) + val label = s""""${shape.mkString("x")}"""" + attrs("label") = label + } + dot.edge(tailName = name, headName = inputName, attrs = attrs.toMap) + } + } + } + } + dot + } +} diff --git a/scala-package/core/src/main/scala/ml/dmlc/mxnet/optimizer/NAG.scala b/scala-package/core/src/main/scala/ml/dmlc/mxnet/optimizer/NAG.scala new file mode 100644 index 000000000000..59ea76e8b8b0 --- /dev/null +++ b/scala-package/core/src/main/scala/ml/dmlc/mxnet/optimizer/NAG.scala @@ -0,0 +1,91 @@ +package ml.dmlc.mxnet.optimizer + +import ml.dmlc.mxnet.{Optimizer, LRScheduler, NDArray} +import ml.dmlc.mxnet.NDArrayConversions._ + +/** + * SGD with nesterov. + * It is implemented according to + * https://github.com/torch/optim/blob/master/sgd.lua + * + * @author Depeng Liang + * + * @param learningRate Float, Step size. + * @param momentum Float, momentum value. + * @param wd Float, L2 regularization coefficient add to all the weights + * @param clipGradient Float, clip gradient in range [-clip_gradient, clip_gradient] + * @param lrScheduler The learning rate scheduler + */ +class NAG(val learningRate: Float = 0.01f, val momentum: Float = 0.0f, + val wd: Float = 0.0001f, val clipGradient: Float = 0f, + val lrScheduler: LRScheduler = null) extends Optimizer { + + if (lrScheduler != null) { + lrScheduler.baseLR = learningRate + } + + /** + * Update the parameters. 
+ * @param index An unique integer key used to index the parameters + * @param weight weight ndarray + * @param grad grad ndarray + * @param state NDArray or other objects returned by initState + * The auxiliary state used in optimization. + */ + override def update(index: Int, weight: NDArray, grad: NDArray, state: AnyRef): Unit = { + // TODO(bing) implement wd_bias, wd_gamma, wd_beta (copy from python package) + val lr = + (if (lrScheduler != null) { + val scheduledLr = lrScheduler(numUpdate) + updateCount(index) + scheduledLr + } else { + this.learningRate + }) * lrScale.getOrElse(index, 1f) + + val wd = getWd(index, this.wd) + var resdGrad = grad * this.rescaleGrad + if (clipGradient != 0f) { + // to get rid of memory leak + val oldResdGrad = resdGrad + resdGrad = NDArray.clip(resdGrad, -clipGradient, clipGradient) + oldResdGrad.dispose() + } + + if (state != null) { + val mom = state.asInstanceOf[NDArray] + mom *= momentum + resdGrad += wd * weight + mom += resdGrad + resdGrad += momentum * mom + weight += -lr * resdGrad + } else { + require(momentum == 0f) + // adder = -lr * (resdGrad + this.wd * weight) + // we write in this way to get rid of memory leak + val adder = this.wd * weight + adder += resdGrad + adder *= (-lr) + weight += adder + adder.dispose() + } + + resdGrad.dispose() + } + + // Create additional optimizer state such as momentum. 
+ override def createState(index: Int, weight: NDArray): AnyRef = { + if (momentum == 0.0f) { + null + } else { + NDArray.zeros(weight.shape, weight.context) + } + } + + // Dispose the state it created + override def disposeState(state: AnyRef): Unit = { + if (state != null) { + state.asInstanceOf[NDArray].dispose() + } + } +} diff --git a/scala-package/core/src/main/scala/ml/dmlc/mxnet/optimizer/SGD.scala b/scala-package/core/src/main/scala/ml/dmlc/mxnet/optimizer/SGD.scala index 349a3ddb31a1..6e35358877e5 100644 --- a/scala-package/core/src/main/scala/ml/dmlc/mxnet/optimizer/SGD.scala +++ b/scala-package/core/src/main/scala/ml/dmlc/mxnet/optimizer/SGD.scala @@ -10,6 +10,11 @@ import ml.dmlc.mxnet.NDArrayConversions._ class SGD(private val learningRate: Float = 0.01f, private val momentum: Float = 0.0f, private val wd: Float = 0.0001f, private val clipGradient: Float = 0f, private val lrScheduler: LRScheduler = null) extends Optimizer { + + if (lrScheduler != null) { + lrScheduler.baseLR = learningRate + } + /** * Update the parameters. * @param index An unique integer key used to index the parameters diff --git a/scala-package/core/src/main/scala/ml/dmlc/mxnet/optimizer/SGLD.scala b/scala-package/core/src/main/scala/ml/dmlc/mxnet/optimizer/SGLD.scala new file mode 100644 index 000000000000..a1bd5db55c3c --- /dev/null +++ b/scala-package/core/src/main/scala/ml/dmlc/mxnet/optimizer/SGLD.scala @@ -0,0 +1,70 @@ +package ml.dmlc.mxnet.optimizer + +import ml.dmlc.mxnet.{Optimizer, LRScheduler, NDArray} +import ml.dmlc.mxnet.NDArrayConversions._ +import ml.dmlc.mxnet.Random + +/** + * Stochastic Langevin Dynamics Updater to sample from a distribution. + * + * @author Depeng Liang + * + * @param learningRate Float, Step size. + * @param rescaleGradient Float, rescaling factor of gradient. 
+ * @param wd Float, L2 regularization coefficient add to all the weights + * @param clipGradient Float, clip gradient in range [-clip_gradient, clip_gradient] + * @param lrScheduler The learning rate scheduler + */ +class SGLD(val learningRate: Float = 0.01f, val rescaleGradient: Float = 1.0f, + val wd: Float = 0.0001f, val clipGradient: Float = 0f, + val lrScheduler: LRScheduler = null) extends Optimizer { + + if (lrScheduler != null) { + lrScheduler.baseLR = learningRate + } + + /** + * Update the parameters. + * @param index An unique integer key used to index the parameters + * @param weight weight ndarray + * @param grad grad ndarray + * @param state NDArray or other objects returned by initState + * The auxiliary state used in optimization. + */ + override def update(index: Int, weight: NDArray, grad: NDArray, state: AnyRef): Unit = { + val lr = + (if (lrScheduler != null) { + val scheduledLr = lrScheduler(numUpdate) + updateCount(index) + scheduledLr + } else { + this.learningRate + }) * lrScale.getOrElse(index, 1f) + + val wd = getWd(index, this.wd) + var resdGrad = grad * this.rescaleGrad + if (clipGradient != 0f) { + // to get rid of memory leak + val oldResdGrad = resdGrad + resdGrad = NDArray.clip(resdGrad, -clipGradient, clipGradient) + oldResdGrad.dispose() + } + + val adder = this.wd * weight + adder += resdGrad + adder *= -(lr / 2) + val norm = Random.normal(0f, Math.sqrt(lr).toFloat, weight.shape, weight.context) + adder += norm + weight += adder + adder.dispose() + norm.dispose() + } + + // Create additional optimizer state such as momentum. 
+ override def createState(index: Int, weight: NDArray): AnyRef = { + null + } + + // Dispose the state it created + override def disposeState(state: AnyRef): Unit = {} +} diff --git a/scala-package/core/src/main/scala/ml/dmlc/mxnet/optimizer/ccSGD.scala b/scala-package/core/src/main/scala/ml/dmlc/mxnet/optimizer/ccSGD.scala new file mode 100644 index 000000000000..fbc82a2efd9b --- /dev/null +++ b/scala-package/core/src/main/scala/ml/dmlc/mxnet/optimizer/ccSGD.scala @@ -0,0 +1,76 @@ +package ml.dmlc.mxnet.optimizer + +import ml.dmlc.mxnet.{Optimizer, LRScheduler, NDArray} +import ml.dmlc.mxnet.NDArrayConversions._ +import ml.dmlc.mxnet.Base._ + + +/** + * A very simple SGD optimizer with momentum and weight regularization. + * Implemented in C++. + * + * @author Depeng Liang + * + * @param learningRate Float, Step size. + * @param momentum Float, momentum value. + * @param rescaleGradient Float, rescaling factor of gradient. + * @param wd Float, L2 regularization coefficient add to all the weights + * @param clipGradient Float, clip gradient in range [-clip_gradient, clip_gradient] + * @param lrScheduler The learning rate scheduler + */ +class ccSGD(val learningRate: Float = 0.01f, val momentum: Float = 0.0f, + val wd: Float = 0.0001f, val rescaleGradient: Float = 1.0f, + val clipGradient: Float = -1f, val lrScheduler: LRScheduler = null + ) extends Optimizer { + + if (lrScheduler != null) { + lrScheduler.baseLR = learningRate + } + + private val optCreator = new OptimizerCreatorRef + private val optHandle = new OptimizerHandleRef + + checkCall(_LIB.mxOptimizerFindCreator("ccsgd", optCreator)) + private val paramKeys = Array("momentum", "rescale_grad", "clip_gradient") + private val paramvals = Array(s"$momentum", s"$rescaleGradient", s"$clipGradient") + checkCall(_LIB.mxOptimizerCreateOptimizer( + optCreator.value, paramKeys.length, paramKeys, paramvals, optHandle)) + + /** + * Update the parameters. 
+ * @param index An unique integer key used to index the parameters + * @param weight weight ndarray + * @param grad grad ndarray + * @param state NDArray or other objects returned by initState + * The auxiliary state used in optimization. + */ + override def update(index: Int, weight: NDArray, grad: NDArray, state: AnyRef): Unit = { + val lr = + (if (lrScheduler != null) { + val scheduledLr = lrScheduler(numUpdate) + updateCount(index) + scheduledLr + } else { + this.learningRate + }) * lrScale.getOrElse(index, 1f) + + val wd = getWd(index, this.wd) + checkCall(_LIB.mxOptimizerUpdate(optHandle.value, index, weight.handle, grad.handle, lr, wd)) + } + + // Create additional optimizer state such as momentum. + override def createState(index: Int, weight: NDArray): AnyRef = { + null + } + + // Dispose the state it created + override def disposeState(state: AnyRef): Unit = {} + + /** + * Free the optimizer handle. + * The object shall never be used after it is disposed. + */ + def dispose(): Unit = { + checkCall(_LIB.mxOptimizerFree(optHandle.value)) + } +} diff --git a/scala-package/core/src/test/scala/ml/dmlc/mxnet/ExecutorSuite.scala b/scala-package/core/src/test/scala/ml/dmlc/mxnet/ExecutorSuite.scala index c0dfa5a7d2a9..6aa88fc9af45 100644 --- a/scala-package/core/src/test/scala/ml/dmlc/mxnet/ExecutorSuite.scala +++ b/scala-package/core/src/test/scala/ml/dmlc/mxnet/ExecutorSuite.scala @@ -43,7 +43,7 @@ class ExecutorSuite extends FunSuite with BeforeAndAfterAll { test("reshape") { val x = Symbol.Variable("x") - val y = Symbol.FullyConnected()(Map("data" -> x, "num_hidden" -> 4)) + val y = Symbol.FullyConnected()()(Map("data" -> x, "num_hidden" -> 4)) val exec = y.simpleBind(Context.cpu(), "write", shapeDict = Map("x" -> Shape(5, 4))) exec.argArrays(0).set(1) diff --git a/scala-package/core/src/test/scala/ml/dmlc/mxnet/OperatorSuite.scala b/scala-package/core/src/test/scala/ml/dmlc/mxnet/OperatorSuite.scala index 62301d3e0864..c6dc0a7bfd16 100644 --- 
a/scala-package/core/src/test/scala/ml/dmlc/mxnet/OperatorSuite.scala +++ b/scala-package/core/src/test/scala/ml/dmlc/mxnet/OperatorSuite.scala @@ -13,7 +13,7 @@ class OperatorSuite extends FunSuite with BeforeAndAfterAll private def checkElementwiseSumWithShape(shape: Shape, n: Int) = { // forward val inputs = (0 until n).map(i => Symbol.Variable(s"arg $i")) - val out = Symbol.ElementWiseSum(name = "esum")(inputs.toArray) + val out = Symbol.ElementWiseSum(name = "esum")(inputs: _*)() val arr = (0 until n).map(_ => Random.uniform(-10, 10, shape)) val arrGrad = (0 until n).map(_ => NDArray.empty(shape)) val exec = out.bind(Context.cpu(), args = arr, argsGrad = arrGrad) @@ -46,7 +46,7 @@ class OperatorSuite extends FunSuite with BeforeAndAfterAll val targetDim = shapes.map(_(dimension)).sum val inputs = (0 until shapes.size).map(i => Symbol.Variable(s"arg$i")) - val out = Symbol.Concat(name = "conc")(inputs.toArray, Map("dim" -> dimension)) + val out = Symbol.Concat(name = "conc")(inputs: _*)(Map("dim" -> dimension)) val arr = shapes.map { shape => val nd = NDArray.empty(shape) nd.set(shape(dimension)) @@ -120,11 +120,11 @@ class OperatorSuite extends FunSuite with BeforeAndAfterAll } test("regression") { - checkRegression(Symbol.LogisticRegressionOutput()( + checkRegression(Symbol.LogisticRegressionOutput()()( Map("data" -> Symbol.Variable("data"), "label" -> Symbol.Variable("label"))), (x: Float) => 1.0f / (1.0f + Math.exp(-x).toFloat), (x: Float, y: Float) => x - y) - checkRegression(Symbol.LinearRegressionOutput()( + checkRegression(Symbol.LinearRegressionOutput()()( Map("data" -> Symbol.Variable("data"), "label" -> Symbol.Variable("label"))), (x: Float) => x, (x: Float, y: Float) => x - y) @@ -147,8 +147,8 @@ class OperatorSuite extends FunSuite with BeforeAndAfterAll // [[ 2., 2., 2., 2.], // [ 2., 2., 2., 2.], // [ 2., 2., 2., 2.]]] - val swap0 = Symbol.SwapAxis()(Map("data" -> data, "dim1" -> 0, "dim2" -> 2)) - val swap = Symbol.SwapAxis()(Map("data" -> 
swap0, "dim1" -> 1, "dim2" -> 2)) + val swap0 = Symbol.SwapAxis()()(Map("data" -> data, "dim1" -> 0, "dim2" -> 2)) + val swap = Symbol.SwapAxis()()(Map("data" -> swap0, "dim1" -> 1, "dim2" -> 2)) val exec = swap.bind(Context.cpu(), args = Array(arrData)) exec.forward() val out = exec.outputs(0) @@ -247,7 +247,7 @@ class OperatorSuite extends FunSuite with BeforeAndAfterAll val batch = 24 val data = Symbol.Variable("data") - val embed = Symbol.Embedding(name = "embed")( + val embed = Symbol.Embedding(name = "embed")()( Map("data" -> data, "input_dim" -> inDim, "output_dim" -> outDim)) // TODO // scalastyle:off println @@ -278,7 +278,7 @@ class OperatorSuite extends FunSuite with BeforeAndAfterAll val arrData = dataTmp.copy() val arrGrad = NDArray.ones(shape) * 3 - val test = Symbol.sign(data) + val test = Symbol.sign()(data)() val exeTest = test.bind(Context.cpu(), args = Array(arrData), argsGrad = Array(arrGrad)) exeTest.forward() val out = exeTest.outputs.head @@ -297,7 +297,7 @@ class OperatorSuite extends FunSuite with BeforeAndAfterAll val arrData = dataTmp.copy() val arrGrad = NDArray.ones(shape) * 2 - val test = Symbol.round(data) + Symbol.ceil(data) + Symbol.floor(data) + val test = Symbol.round()(data)() + Symbol.ceil()(data)() + Symbol.floor()(data)() val exeTest = test.bind(Context.cpu(), args = Array(arrData)) exeTest.forward() val out = exeTest.outputs.head @@ -312,7 +312,7 @@ class OperatorSuite extends FunSuite with BeforeAndAfterAll val arrData = dataTmp.copy() val arrGrad = NDArray.ones(shape) * 3 - val test = Symbol.rsqrt(data) + Symbol.cos(data) + Symbol.sin(data) + val test = Symbol.rsqrt()(data)() + Symbol.cos()(data)() + Symbol.sin()(data)() val exeTest = test.bind(Context.cpu(), args = Array(arrData), argsGrad = Array(arrGrad)) exeTest.forward() val out = exeTest.outputs.head @@ -370,7 +370,7 @@ class OperatorSuite extends FunSuite with BeforeAndAfterAll test("transpose") { val data = Symbol.Variable("data") - val test = Symbol.transpose(data) 
+ val test = Symbol.transpose()(data)() val shape = Shape(3, 4) val ctx = Context.cpu() @@ -391,6 +391,39 @@ class OperatorSuite extends FunSuite with BeforeAndAfterAll assert(reldiff(out.toArray, trans) < 1e-6) } + test("smooth_l1 & makeloss") { + val data = Symbol.Variable("data") + val smoothL1 = Symbol.smooth_l1()()(Map("data" -> data, "scalar" -> 1.0f)) + val loss = Symbol.MakeLoss()()(Map("data" -> smoothL1)) + + val shape = Shape(2, 6) + val ctx = Context.cpu() + val input = NDArray.empty(ctx, shape.toArray: _*) + val grad = NDArray.empty(ctx, shape.toArray: _*) + val array = Array[Float]( + -3.5f, -2.5f, -1.5f, -0.5f, -0.3f, -0.1f, + 0.1f, 0.3f, 0.5f, 1.5f, 2.5f, 3.5f) + input.set(array) + + val arrTmp = Array[Float]( + 3.0f, 2.0f, 1.0f, 0.125f, 0.045f, 0.005f, + 0.005f, 0.045f, 0.125f, 1.0f, 2.0f, 3.0f) + val gradTmp = Array[Float]( + -1.0f, -1.0f, -1.0f, -0.5f, -0.3f, -0.1f, + 0.1f, 0.3f, 0.5f, 1.0f, 1.0f, 1.0f) + + val exeTest = + loss.bind(ctx, args = Map("data" -> input), argsGrad = Map("data" -> grad)) + exeTest.forward(isTrain = true) + val out = exeTest.outputs.head + + assert(reldiff(out.toArray, arrTmp) < 1e-6) + + exeTest.backward() + + assert(reldiff(grad.toArray, gradTmp) < 1e-6) + } + test("maximum minimum scalar") { val data = Symbol.Variable("data") val shape = Shape(3, 4) @@ -413,7 +446,7 @@ class OperatorSuite extends FunSuite with BeforeAndAfterAll val arrData = dataTmp.copy() val arrGrad = NDArray.ones(shape) * 3 - val test = Symbol.abs(data) + val test = Symbol.abs()(data)() val exeTest = test.bind(Context.cpu(), args = Array(arrData), argsGrad = Array(arrGrad)) exeTest.forward() val out = exeTest.outputs.head @@ -439,10 +472,10 @@ class OperatorSuite extends FunSuite with BeforeAndAfterAll pad: (Int, Int)): Unit = { require(inputShape(1) == numFilter) val data = Symbol.Variable(name = "data") - val conv = Symbol.Convolution(name = "conv")(Map( + val conv = Symbol.Convolution(name = "conv")()(Map( "data" -> data, "kernel" -> kernel, 
"stride" -> stride, "pad" -> pad, "num_filter" -> numFilter, "no_bias" -> "true")) - val deconv = Symbol.Deconvolution(name = "deconv")(Map( + val deconv = Symbol.Deconvolution(name = "deconv")()(Map( "data" -> conv, "kernel" -> kernel, "stride" -> stride, "pad" -> pad, "num_filter" -> numFilter, "no_bias" -> "true")) @@ -498,11 +531,11 @@ class OperatorSuite extends FunSuite with BeforeAndAfterAll val stride = (1, 1) val kernel = (2 * pad._1 + 1, 2 * pad._2 + 1) val dataConv = Symbol.Variable(name = "data_conv") - val conv = Symbol.Convolution(name = "conv")(Map( + val conv = Symbol.Convolution(name = "conv")()(Map( "data" -> dataConv, "kernel" -> kernel, "stride" -> stride, "pad" -> pad, "num_filter" -> numFilter, "no_bias" -> "true")) val dataDeconv = Symbol.Variable(name = "data_deconv") - val deconv = Symbol.Deconvolution(name = "deconv")(Map( + val deconv = Symbol.Deconvolution(name = "deconv")()(Map( "data" -> dataDeconv, "kernel" -> kernel, "stride" -> stride, "pad" -> pad, "num_filter" -> numFilter, "no_bias" -> "true")) @@ -550,8 +583,8 @@ class OperatorSuite extends FunSuite with BeforeAndAfterAll (s"arg_$i", NDArray.zeros(shape)) }.toMap - val up = Symbol.UpSampling()((0 until shapes.size).map(i => Symbol.Variable(s"arg_$i")).toArray, - Map("sample_type" -> "nearest", "scale" -> rootScale)) + val upArgs = (0 until shapes.size).map(i => Symbol.Variable(s"arg_$i")) + val up = Symbol.UpSampling()(upArgs: _*)(Map("sample_type" -> "nearest", "scale" -> rootScale)) val exe = up.bind(Context.cpu(), args = arr, argsGrad = arrGrad) exe.forward(isTrain = true) exe.backward(exe.outputs) @@ -583,7 +616,7 @@ class OperatorSuite extends FunSuite with BeforeAndAfterAll test("batch norm") { val data = Symbol.Variable("data") - val test = Symbol.BatchNorm(name = "bn")(Map("data" -> data, "fix_gamma" -> "False")) + val test = Symbol.BatchNorm(name = "bn")()(Map("data" -> data, "fix_gamma" -> "False")) // scalastyle:off println println(s"BatchNorm: ${test.toJson}") // 
scalastyle:on println diff --git a/scala-package/core/src/test/scala/ml/dmlc/mxnet/RecordIOSuite.scala b/scala-package/core/src/test/scala/ml/dmlc/mxnet/RecordIOSuite.scala new file mode 100644 index 000000000000..670e3f6cafa7 --- /dev/null +++ b/scala-package/core/src/test/scala/ml/dmlc/mxnet/RecordIOSuite.scala @@ -0,0 +1,69 @@ +package ml.dmlc.mxnet + +import org.scalatest.{BeforeAndAfterAll, FunSuite} +import java.io._ + +class RecordIOSuite extends FunSuite with BeforeAndAfterAll { + test("test RecordIO") { + val fRec = File.createTempFile("tmpFile", ".tmp") + val N = 255 + + val writer = new MXRecordIO(fRec.getAbsolutePath, MXRecordIO.IOWrite) + for (i <- 0 until N) { + writer.write(s"$i") + } + writer.close() + + val reader = new MXRecordIO(fRec.getAbsolutePath, MXRecordIO.IORead) + for (i <- 0 until N) { + val res = reader.read() + assert(res === s"$i") + } + } + + test("test IndexedRecordIO") { + val fIdxRec = File.createTempFile("tmpIdxFile", ".tmp") + val fIdx = File.createTempFile("tmpIdx", ".tmp") + val N = 255 + + val writer = new MXIndexedRecordIO(fIdx.getAbsolutePath, + fIdxRec.getAbsolutePath, MXRecordIO.IOWrite) + for (i <- 0 until N) { + writer.writeIdx(i, s"$i") + } + writer.close() + + val reader = new MXIndexedRecordIO(fIdx.getAbsolutePath, + fIdxRec.getAbsolutePath, MXRecordIO.IORead) + var keys = reader.keys().map(_.asInstanceOf[Int]).toList.sorted + assert(keys.zip(0 until N).forall(x => x._1 == x._2)) + keys = scala.util.Random.shuffle(keys) + for (k <- keys) { + val res = reader.readIdx(k) + assert(res === s"$k") + } + } + + test("test RecordIOPackLabel") { + val fRec = File.createTempFile("tmpFile", ".tmp") + val N = 255 + + val charsDigits = + (0 until 26).map(x => ('A' + x).toChar.toString ).toArray ++ (0 to 9).map(_.toString) + + for (i <- 1 until N) { + for (j <- 0 until N) { + val content = { + val idx = scala.util.Random.shuffle(charsDigits.indices.toList).take(j) + idx.map(charsDigits(_)).mkString + } + val label = (0 until 
i).map(x => scala.util.Random.nextFloat()).toArray + val header = MXRecordIO.IRHeader(0, label, 0, 0) + val s = MXRecordIO.pack(header, content) + val (rHeader, rContent) = MXRecordIO.unpack(s) + assert(label === rHeader.label) + assert(content === rContent) + } + } + } +} diff --git a/scala-package/core/src/test/scala/ml/dmlc/mxnet/SymbolSuite.scala b/scala-package/core/src/test/scala/ml/dmlc/mxnet/SymbolSuite.scala index 71fb8f257191..0970e355c20a 100644 --- a/scala-package/core/src/test/scala/ml/dmlc/mxnet/SymbolSuite.scala +++ b/scala-package/core/src/test/scala/ml/dmlc/mxnet/SymbolSuite.scala @@ -6,14 +6,14 @@ class SymbolSuite extends FunSuite with BeforeAndAfterAll { test("symbol compose") { val data = Symbol.Variable("data") - var net1 = Symbol.FullyConnected(name = "fc1")(Map("data" -> data, "num_hidden" -> 10)) - net1 = Symbol.FullyConnected(name = "fc2")(Map("data" -> net1, "num_hidden" -> 100)) + var net1 = Symbol.FullyConnected(name = "fc1")()(Map("data" -> data, "num_hidden" -> 10)) + net1 = Symbol.FullyConnected(name = "fc2")()(Map("data" -> net1, "num_hidden" -> 100)) assert(net1.listArguments().toArray === Array("data", "fc1_weight", "fc1_bias", "fc2_weight", "fc2_bias")) - var net2 = Symbol.FullyConnected(name = "fc3")(Map("num_hidden" -> 10)) - net2 = Symbol.Activation()(Map("data" -> net2, "act_type" -> "relu")) - net2 = Symbol.FullyConnected(name = "fc4")(Map("data" -> net2, "num_hidden" -> 20)) + var net2 = Symbol.FullyConnected(name = "fc3")()(Map("num_hidden" -> 10)) + net2 = Symbol.Activation()()(Map("data" -> net2, "act_type" -> "relu")) + net2 = Symbol.FullyConnected(name = "fc4")()(Map("data" -> net2, "num_hidden" -> 20)) // scalastyle:off println println(s"net2 debug info:\n${net2.debugStr}") // scalastyle:on println @@ -28,8 +28,8 @@ class SymbolSuite extends FunSuite with BeforeAndAfterAll { test("symbol internal") { val data = Symbol.Variable("data") - val oldfc = Symbol.FullyConnected(name = "fc1")(Map("data" -> data, "num_hidden" 
-> 10)) - val net1 = Symbol.FullyConnected(name = "fc2")(Map("data" -> oldfc, "num_hidden" -> 100)) + val oldfc = Symbol.FullyConnected(name = "fc1")()(Map("data" -> data, "num_hidden" -> 10)) + val net1 = Symbol.FullyConnected(name = "fc2")()(Map("data" -> oldfc, "num_hidden" -> 100)) assert(net1.listArguments().toArray === Array("data", "fc1_weight", "fc1_bias", "fc2_weight", "fc2_bias")) val internal = net1.getInternals() @@ -39,9 +39,9 @@ class SymbolSuite extends FunSuite with BeforeAndAfterAll { test("symbol infer type") { val data = Symbol.Variable("data") - val f32data = Symbol.Cast()(Map("data" -> data, "dtype" -> "float32")) - val fc1 = Symbol.FullyConnected(name = "fc1")(Map("data" -> f32data, "num_hidden" -> 128)) - val mlp = Symbol.SoftmaxOutput(name = "softmax")(Map("data" -> fc1)) + val f32data = Symbol.Cast()()(Map("data" -> data, "dtype" -> "float32")) + val fc1 = Symbol.FullyConnected(name = "fc1")()(Map("data" -> f32data, "num_hidden" -> 128)) + val mlp = Symbol.SoftmaxOutput(name = "softmax")()(Map("data" -> fc1)) val (arg, out, aux) = mlp.inferType(Map("data" -> classOf[Double])) assert(arg.toArray === Array(classOf[Double], classOf[Float], classOf[Float], classOf[Float])) diff --git a/scala-package/core/src/test/scala/ml/dmlc/mxnet/train/ConvSuite.scala b/scala-package/core/src/test/scala/ml/dmlc/mxnet/train/ConvSuite.scala index 57c065f2e86b..837258a2406d 100644 --- a/scala-package/core/src/test/scala/ml/dmlc/mxnet/train/ConvSuite.scala +++ b/scala-package/core/src/test/scala/ml/dmlc/mxnet/train/ConvSuite.scala @@ -16,23 +16,23 @@ class ConvSuite extends FunSuite with BeforeAndAfterAll { val batchSize = 100 val data = Symbol.Variable("data") - val conv1 = Symbol.Convolution(name = "conv1")(Map("data" -> data, "num_filter" -> 32, - "kernel" -> (3, 3), "stride" -> (2, 2))) - val bn1 = Symbol.BatchNorm(name = "bn1")(Map("data" -> conv1)) - val act1 = Symbol.Activation(name = "relu1")(Map("data" -> bn1, "act_type" -> "relu")) - val mp1 = 
Symbol.Pooling(name = "mp1")(Map("data" -> act1, "kernel" -> (2, 2), - "stride" -> (2, 2), "pool_type" -> "max")) + val conv1 = Symbol.Convolution(name = "conv1")()(Map("data" -> data, "num_filter" -> 32, + "kernel" -> (3, 3), "stride" -> (2, 2))) + val bn1 = Symbol.BatchNorm(name = "bn1")()(Map("data" -> conv1)) + val act1 = Symbol.Activation(name = "relu1")()(Map("data" -> bn1, "act_type" -> "relu")) + val mp1 = Symbol.Pooling(name = "mp1")()(Map("data" -> act1, "kernel" -> (2, 2), + "stride" -> (2, 2), "pool_type" -> "max")) - val conv2 = Symbol.Convolution(name = "conv2")(Map("data" -> mp1, "num_filter" -> 32, - "kernel" -> (3, 3), "stride" -> (2, 2))) - val bn2 = Symbol.BatchNorm(name = "bn2")(Map("data" -> conv2)) - val act2 = Symbol.Activation(name = "relu2")(Map("data" -> bn2, "act_type" -> "relu")) - val mp2 = Symbol.Pooling(name = "mp2")(Map("data" -> act2, "kernel" -> (2, 2), - "stride" -> (2, 2), "pool_type" -> "max")) + val conv2 = Symbol.Convolution(name = "conv2")()(Map("data" -> mp1, "num_filter" -> 32, + "kernel" -> (3, 3), "stride" -> (2, 2))) + val bn2 = Symbol.BatchNorm(name = "bn2")()(Map("data" -> conv2)) + val act2 = Symbol.Activation(name = "relu2")()(Map("data" -> bn2, "act_type" -> "relu")) + val mp2 = Symbol.Pooling(name = "mp2")()(Map("data" -> act2, "kernel" -> (2, 2), + "stride" -> (2, 2), "pool_type" -> "max")) - val fl = Symbol.Flatten(name = "flatten")(Map("data" -> mp2)) - val fc2 = Symbol.FullyConnected(name = "fc2")(Map("data" -> fl, "num_hidden" -> 10)) - val softmax = Symbol.SoftmaxOutput(name = "sm")(Map("data" -> fc2)) + val fl = Symbol.Flatten(name = "flatten")()(Map("data" -> mp2)) + val fc2 = Symbol.FullyConnected(name = "fc2")()(Map("data" -> fl, "num_hidden" -> 10)) + val softmax = Symbol.SoftmaxOutput(name = "sm")()(Map("data" -> fc2)) // get data "./scripts/get_mnist_data.sh" ! 
diff --git a/scala-package/examples/pom.xml b/scala-package/examples/pom.xml index 2b3f44c42448..cc331129cec8 100644 --- a/scala-package/examples/pom.xml +++ b/scala-package/examples/pom.xml @@ -5,12 +5,12 @@ 4.0.0 ml.dmlc.mxnet - mxnet-parent_2.10 + mxnet-parent_2.11 0.1.2-SNAPSHOT ../pom.xml - mxnet-examples_2.10 + mxnet-examples_2.11 0.1.2-SNAPSHOT MXNet Scala Package - Examples pom @@ -152,17 +152,17 @@ com.sksamuel.scrimage - scrimage-core_2.10 + scrimage-core_2.11 2.1.5 com.sksamuel.scrimage - scrimage-io-extra_2.10 + scrimage-io-extra_2.11 2.1.5 com.sksamuel.scrimage - scrimage-filters_2.10 + scrimage-filters_2.11 2.1.5 diff --git a/scala-package/examples/scripts/neuralstyle_end2end/run_test_end2end.sh b/scala-package/examples/scripts/neuralstyle_end2end/run_test_end2end.sh new file mode 100644 index 000000000000..32f20a153078 --- /dev/null +++ b/scala-package/examples/scripts/neuralstyle_end2end/run_test_end2end.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +MXNET_ROOT=$(cd "$(dirname $0)/../../../.."; pwd) +CLASS_PATH=$MXNET_ROOT/scala-package/assembly/linux-x86_64-gpu/target/*:$MXNET_ROOT/scala-package/examples/target/*:$MXNET_ROOT/scala-package/examples/target/classes/lib/* + +INPUT_IMG=$1 +MODEL_DIR=$2 +OUTPUT_DIR=$3 +GPU=0 + +java -Xmx1024m -cp $CLASS_PATH \ + ml.dmlc.mxnet.examples.neuralstyle.end2end.BoostInference \ + --model-path $MODEL_DIR \ + --input-image $INPUT_IMG \ + --output-path $OUTPUT_DIR \ + --gpu $GPU \ No newline at end of file diff --git a/scala-package/examples/scripts/neuralstyle_end2end/run_train_end2end.sh b/scala-package/examples/scripts/neuralstyle_end2end/run_train_end2end.sh new file mode 100644 index 000000000000..4c1f1dfead9a --- /dev/null +++ b/scala-package/examples/scripts/neuralstyle_end2end/run_train_end2end.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +MXNET_ROOT=$(cd "$(dirname $0)/../../../.."; pwd) 
+CLASS_PATH=$MXNET_ROOT/scala-package/assembly/linux-x86_64-gpu/target/*:$MXNET_ROOT/scala-package/examples/target/*:$MXNET_ROOT/scala-package/examples/target/classes/lib/* + +# more details please refer to +# https://github.com/Ldpe2G/mxnet/blob/develop/example/neural-style/end_to_end/README.md +TRAIN_DATA_PATH=$1 +STYLE_IMG=$2 +VGG_MODEL_PATH=$3 +SAVE_MODEL_DIR=$4 +GPU=0 + +java -Xmx1024m -cp $CLASS_PATH \ + ml.dmlc.mxnet.examples.neuralstyle.end2end.BoostTrain \ + --data-path $TRAIN_DATA_PATH \ + --vgg--model-path $VGG_MODEL_PATH \ + --save--model-path $SAVE_MODEL_DIR \ + --style-image $STYLE_IMG \ + --gpu $GPU \ No newline at end of file diff --git a/scala-package/examples/scripts/run_multitask.sh b/scala-package/examples/scripts/run_multitask.sh new file mode 100644 index 000000000000..d5d620ca5b6a --- /dev/null +++ b/scala-package/examples/scripts/run_multitask.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +MXNET_ROOT=$(cd "$(dirname $0)/../../.."; pwd) +CLASS_PATH=$MXNET_ROOT/scala-package/assembly/linux-x86_64-gpu/target/*:$MXNET_ROOT/scala-package/examples/target/*:$MXNET_ROOT/scala-package/examples/target/classes/lib/* + +# which gpu card to use, -1 means cpu +GPU=$1 + +# the mnist data path +# you can get the mnist data using the script core/scripts/get_mnist_data.sh +DATA_PATH=$2 + +java -Xmx4G -cp $CLASS_PATH \ + ml.dmlc.mxnet.examples.multitask.ExampleMultiTask \ + --data-path $DATA_PATH \ + --gpu $GPU \ diff --git a/scala-package/examples/scripts/run_visualization.sh b/scala-package/examples/scripts/run_visualization.sh new file mode 100644 index 000000000000..9a9e1ae41649 --- /dev/null +++ b/scala-package/examples/scripts/run_visualization.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +MXNET_ROOT=$(cd "$(dirname $0)/../../.."; pwd) +CLASS_PATH=$MXNET_ROOT/scala-package/assembly/linux-x86_64-cpu/target/*:$MXNET_ROOT/scala-package/examples/target/*:$MXNET_ROOT/scala-package/examples/target/classes/lib/* + +# please install the Graphviz library +# if you are using ubuntu, 
use the following command: +# sudo apt-get install graphviz + +# path to save the generated visualization result +OUT_DIR=$1 +# net to visualze, e.g. "LeNet", "AlexNet", "VGG", "GoogleNet", "Inception_BN", "Inception_V3", "ResNet_Small" +NET=$2 + +java -Xmx1024m -cp $CLASS_PATH \ + ml.dmlc.mxnet.examples.visualization.ExampleVis \ + --out-dir $OUT_DIR \ + --net $NET diff --git a/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/cnntextclassification/CNNTextClassification.scala b/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/cnntextclassification/CNNTextClassification.scala index 748632c28137..e39bc5564523 100644 --- a/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/cnntextclassification/CNNTextClassification.scala +++ b/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/cnntextclassification/CNNTextClassification.scala @@ -35,26 +35,26 @@ object CNNTextClassification { val inputX = Symbol.Variable("data") val inputY = Symbol.Variable("softmax_label") val polledOutputs = filterList.map { filterSize => - val conv = Symbol.Convolution()(Map("data" -> inputX, "kernel" -> s"($filterSize, $numEmbed)", - "num_filter" -> numFilter)) - val relu = Symbol.Activation()(Map("data" -> conv, "act_type" -> "relu")) - val pool = Symbol.Pooling()(Map("data" -> relu, "pool_type" -> "max", - "kernel" -> s"(${sentenceSize - filterSize + 1}, 1)", "stride" -> "(1,1)")) - pool + val conv = Symbol.Convolution()()( + Map("data" -> inputX, "kernel" -> s"($filterSize, $numEmbed)", "num_filter" -> numFilter)) + val relu = Symbol.Activation()()(Map("data" -> conv, "act_type" -> "relu")) + val pool = Symbol.Pooling()()(Map("data" -> relu, "pool_type" -> "max", + "kernel" -> s"(${sentenceSize - filterSize + 1}, 1)", "stride" -> "(1,1)")) + pool } val totalFilters = numFilter * filterList.length - val concat = Symbol.Concat()(polledOutputs, Map("dim" -> 1)) - val hPool = Symbol.Reshape()(Map("data" -> concat, + val concat = 
Symbol.Concat()(polledOutputs: _*)(Map("dim" -> 1)) + val hPool = Symbol.Reshape()()(Map("data" -> concat, "target_shape" -> s"($batchSize, $totalFilters)")) val hDrop = { - if (dropout > 0f) Symbol.Dropout()(Map("data" -> hPool, "p" -> dropout)) + if (dropout > 0f) Symbol.Dropout()()(Map("data" -> hPool, "p" -> dropout)) else hPool } - val fc = Symbol.FullyConnected()(Map("data" -> hDrop, "num_hidden" -> numLabel)) - val sm = Symbol.SoftmaxOutput()(Map("data" -> fc, "label" -> inputY)) + val fc = Symbol.FullyConnected()()(Map("data" -> hDrop, "num_hidden" -> numLabel)) + val sm = Symbol.SoftmaxOutput()()(Map("data" -> fc, "label" -> inputY)) sm } diff --git a/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/imclassification/TrainMnist.scala b/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/imclassification/TrainMnist.scala index 44792cf4fc00..5097cf196d3c 100644 --- a/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/imclassification/TrainMnist.scala +++ b/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/imclassification/TrainMnist.scala @@ -13,12 +13,12 @@ object TrainMnist { // multi-layer perceptron def getMlp: Symbol = { val data = Symbol.Variable("data") - val fc1 = Symbol.FullyConnected(name = "fc1")(Map("data" -> data, "num_hidden" -> 128)) - val act1 = Symbol.Activation(name = "relu1")(Map("data" -> fc1, "act_type" -> "relu")) - val fc2 = Symbol.FullyConnected(name = "fc2")(Map("data" -> act1, "num_hidden" -> 64)) - val act2 = Symbol.Activation(name = "relu2")(Map("data" -> fc2, "act_type" -> "relu")) - val fc3 = Symbol.FullyConnected(name = "fc3")(Map("data" -> act2, "num_hidden" -> 10)) - val mlp = Symbol.SoftmaxOutput(name = "softmax")(Map("data" -> fc3)) + val fc1 = Symbol.FullyConnected(name = "fc1")()(Map("data" -> data, "num_hidden" -> 128)) + val act1 = Symbol.Activation(name = "relu1")()(Map("data" -> fc1, "act_type" -> "relu")) + val fc2 = Symbol.FullyConnected(name = "fc2")()(Map("data" -> act1, 
"num_hidden" -> 64)) + val act2 = Symbol.Activation(name = "relu2")()(Map("data" -> fc2, "act_type" -> "relu")) + val fc3 = Symbol.FullyConnected(name = "fc3")()(Map("data" -> act2, "num_hidden" -> 10)) + val mlp = Symbol.SoftmaxOutput(name = "softmax")()(Map("data" -> fc3)) mlp } @@ -28,23 +28,25 @@ object TrainMnist { def getLenet: Symbol = { val data = Symbol.Variable("data") // first conv - val conv1 = Symbol.Convolution()(Map("data" -> data, "kernel" -> "(5, 5)", "num_filter" -> 20)) - val tanh1 = Symbol.Activation()(Map("data" -> conv1, "act_type" -> "tanh")) - val pool1 = Symbol.Pooling()(Map("data" -> tanh1, "pool_type" -> "max", - "kernel" -> "(2, 2)", "stride" -> "(2, 2)")) + val conv1 = Symbol.Convolution()()( + Map("data" -> data, "kernel" -> "(5, 5)", "num_filter" -> 20)) + val tanh1 = Symbol.Activation()()(Map("data" -> conv1, "act_type" -> "tanh")) + val pool1 = Symbol.Pooling()()(Map("data" -> tanh1, "pool_type" -> "max", + "kernel" -> "(2, 2)", "stride" -> "(2, 2)")) // second conv - val conv2 = Symbol.Convolution()(Map("data" -> pool1, "kernel" -> "(5, 5)", "num_filter" -> 50)) - val tanh2 = Symbol.Activation()(Map("data" -> conv2, "act_type" -> "tanh")) - val pool2 = Symbol.Pooling()(Map("data" -> tanh2, "pool_type" -> "max", - "kernel" -> "(2, 2)", "stride" -> "(2, 2)")) + val conv2 = Symbol.Convolution()()( + Map("data" -> pool1, "kernel" -> "(5, 5)", "num_filter" -> 50)) + val tanh2 = Symbol.Activation()()(Map("data" -> conv2, "act_type" -> "tanh")) + val pool2 = Symbol.Pooling()()(Map("data" -> tanh2, "pool_type" -> "max", + "kernel" -> "(2, 2)", "stride" -> "(2, 2)")) // first fullc - val flatten = Symbol.Flatten()(Map("data" -> pool2)) - val fc1 = Symbol.FullyConnected()(Map("data" -> flatten, "num_hidden" -> 500)) - val tanh3 = Symbol.Activation()(Map("data" -> fc1, "act_type" -> "tanh")) + val flatten = Symbol.Flatten()()(Map("data" -> pool2)) + val fc1 = Symbol.FullyConnected()()(Map("data" -> flatten, "num_hidden" -> 500)) + val tanh3 = 
Symbol.Activation()()(Map("data" -> fc1, "act_type" -> "tanh")) // second fullc - val fc2 = Symbol.FullyConnected()(Map("data" -> tanh3, "num_hidden" -> 10)) + val fc2 = Symbol.FullyConnected()()(Map("data" -> tanh3, "num_hidden" -> 10)) // loss - val lenet = Symbol.SoftmaxOutput(name = "softmax")(Map("data" -> fc2)) + val lenet = Symbol.SoftmaxOutput(name = "softmax")()(Map("data" -> fc2)) lenet } diff --git a/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/multitask/Data.scala b/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/multitask/Data.scala new file mode 100644 index 000000000000..fa5f8779cb6e --- /dev/null +++ b/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/multitask/Data.scala @@ -0,0 +1,34 @@ +package ml.dmlc.mxnet.examples.multitask + +import ml.dmlc.mxnet.Shape +import ml.dmlc.mxnet.IO +import ml.dmlc.mxnet.DataIter + +/** + * @author Depeng Liang + */ +object Data { + + // return train and val iterators for mnist + def mnistIterator(dataPath: String, batchSize: Int, inputShape: Shape): (DataIter, DataIter) = { + val flat = if (inputShape.length == 3) "False" else "True" + val trainParams = Map( + "image" -> s"$dataPath/train-images-idx3-ubyte", + "label" -> s"$dataPath/train-labels-idx1-ubyte", + "input_shape" -> inputShape.toString(), + "batch_size" -> s"$batchSize", + "shuffle" -> "True", + "flat" -> flat + ) + val trainDataIter = IO.MNISTIter(trainParams) + val testParams = Map( + "image" -> s"$dataPath/t10k-images-idx3-ubyte", + "label" -> s"$dataPath/t10k-labels-idx1-ubyte", + "input_shape" -> inputShape.toString(), + "batch_size" -> s"$batchSize", + "flat" -> flat + ) + val testDataIter = IO.MNISTIter(testParams) + (trainDataIter, testDataIter) + } +} diff --git a/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/multitask/ExampleMultiTask.scala b/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/multitask/ExampleMultiTask.scala new file mode 100644 index 000000000000..6089ee7f1c18 
--- /dev/null +++ b/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/multitask/ExampleMultiTask.scala @@ -0,0 +1,317 @@ +package ml.dmlc.mxnet.examples.multitask + +import org.kohsuke.args4j.{CmdLineParser, Option} +import org.slf4j.LoggerFactory +import scala.collection.JavaConverters._ +import ml.dmlc.mxnet.Symbol +import ml.dmlc.mxnet.DataIter +import ml.dmlc.mxnet.DataBatch +import ml.dmlc.mxnet.NDArray +import ml.dmlc.mxnet.Shape +import ml.dmlc.mxnet.EvalMetric +import ml.dmlc.mxnet.Context +import ml.dmlc.mxnet.Xavier +import ml.dmlc.mxnet.optimizer.RMSProp + +/** + * Example of multi-task + * @author Depeng Liang + */ +object ExampleMultiTask { + private val logger = LoggerFactory.getLogger(classOf[ExampleMultiTask]) + + def buildNetwork(): Symbol = { + val data = Symbol.Variable("data") + val fc1 = Symbol.FullyConnected("fc1")()(Map("data" -> data, "num_hidden" -> 128)) + val act1 = Symbol.Activation("relu1")()(Map("data" -> fc1, "act_type" -> "relu")) + val fc2 = Symbol.FullyConnected("fc2")()(Map("data" -> act1, "num_hidden" -> 64)) + val act2 = Symbol.Activation("relu2")()(Map("data" -> fc2, "act_type" -> "relu")) + val fc3 = Symbol.FullyConnected("fc3")()(Map("data" -> act2, "num_hidden" -> 10)) + val sm1 = Symbol.SoftmaxOutput("softmax1")()(Map("data" -> fc3)) + val sm2 = Symbol.SoftmaxOutput("softmax2")()(Map("data" -> fc3)) + + val softmax = Symbol.Group(sm1, sm2) + + softmax + } + + // multi label mnist iterator + class MultiMnistIterator(dataIter: DataIter) extends DataIter { + + @throws(classOf[NoSuchElementException]) + override def next(): DataBatch = { + if (hasNext) { + val batch = this.dataIter.next() + val label = batch.label(0) + new DataBatch(batch.data, + IndexedSeq(label, label), + batch.index, + batch.pad) + } else { + throw new NoSuchElementException + } + } + + /** + * reset the iterator + */ + override def reset(): Unit = this.dataIter.reset() + + override def batchSize: Int = dataIter.batchSize + + /** + * get data of 
current batch + * @return the data of current batch + */ + override def getData(): IndexedSeq[NDArray] = this.dataIter.getData() + + /** + * Get label of current batch + * @return the label of current batch + */ + override def getLabel(): IndexedSeq[NDArray] = { + val label = this.dataIter.getLabel()(0) + IndexedSeq(label, label) + } + + /** + * the index of current batch + * @return + */ + override def getIndex(): IndexedSeq[Long] = this.dataIter.getIndex() + + // The name and shape of label provided by this iterator + override def provideLabel: Map[String, Shape] = { + val provideLabel = this.dataIter.provideLabel.toArray + // Different labels should be used here for actual application + Map("softmax1_label" -> provideLabel(0)._2, + "softmax2_label" -> provideLabel(0)._2) + } + + /** + * get the number of padding examples + * in current batch + * @return number of padding examples in current batch + */ + override def getPad(): Int = this.dataIter.getPad() + + // The name and shape of data provided by this iterator + override def provideData: Map[String, Shape] = this.dataIter.provideData + + override def hasNext: Boolean = this.dataIter.hasNext + } + + class MultiAccuracy(num: Int, name: String) { + require(num >= 1) + + private var sumMetric: Array[Float] = new Array[Float](num) + private var numInst: Array[Int] = new Array[Int](num) + + def update(labels: IndexedSeq[NDArray], preds: IndexedSeq[NDArray]): Unit = { + require(labels.length == preds.length, + "labels and predictions should have the same length.") + assert(labels.length == num) + + for (i <- labels.indices) { + val (pred, label) = (preds(i), labels(i)) + val predLabel = NDArray.argmaxChannel(pred) + require(label.shape == predLabel.shape, + s"label ${label.shape} and prediction ${predLabel.shape}" + + s"should have the same length.") + for ((labelElem, predElem) <- label.toArray zip predLabel.toArray) { + if (labelElem == predElem) { + this.sumMetric(i) += 1 + } + } + this.numInst(i) += 
predLabel.shape(0) + predLabel.dispose() + } + } + + def get(): Array[(String, Float)] = { + (0 until num).map( i => (this.name, this.sumMetric(i) / this.numInst(i))).toArray + } + + def reset(): Unit = { + this.numInst = this.numInst.map(x => 0) + this.sumMetric = this.numInst.map(x => 0f) + } + + } + + class Speedometer(val batchSize: Int, val frequent: Int = 50) { + private val logger = LoggerFactory.getLogger(classOf[Speedometer]) + private var init = false + private var tic: Long = 0L + private var lastCount: Int = 0 + + def invoke(epoch: Int, count: Int, evalMetric: MultiAccuracy): Unit = { + if (lastCount > count) { + init = false + } + lastCount = count + + if (init) { + if (count % frequent == 0) { + val speed = frequent.toDouble * batchSize / (System.currentTimeMillis - tic) * 1000 + if (evalMetric != null) { + val nameVals = evalMetric.get + nameVals.foreach { case (name, value) => + logger.info("Epoch[%d] Batch [%d]\tSpeed: %.2f samples/sec\tTrain-%s=%f".format( + epoch, count, speed, name, value)) + } + } else { + logger.info("Iter[%d] Batch [%d]\tSpeed: %.2f samples/sec".format(epoch, count, speed)) + } + tic = System.currentTimeMillis + } + } else { + init = true + tic = System.currentTimeMillis + } + } + } + + def main(args: Array[String]): Unit = { + val lesk = new ExampleMultiTask + val parser: CmdLineParser = new CmdLineParser(lesk) + try { + parser.parseArgument(args.toList.asJava) + assert(lesk.dataPath != null) + + val batchSize = 100 + val numEpoch = 100 + val ctx = if (lesk.gpu != -1) Context.gpu(lesk.gpu) else Context.cpu() + val lr = 0.001f + val network = buildNetwork() + val (trainIter, valIter) = + Data.mnistIterator(lesk.dataPath, batchSize = batchSize, inputShape = Shape(784)) + val trainMultiIter = new MultiMnistIterator(trainIter) + val valMultiIter = new MultiMnistIterator(valIter) + + val datasAndLabels = trainMultiIter.provideData ++ trainMultiIter.provideLabel + val (argShapes, outputShapes, auxShapes) = 
network.inferShape(datasAndLabels) + + val initializer = new Xavier(factorType = "in", magnitude = 2.34f) + + val argNames = network.listArguments() + val argDict = argNames.zip(argShapes.map(NDArray.empty(_, ctx))).toMap + val auxNames = network.listAuxiliaryStates() + val auxDict = auxNames.zip(auxShapes.map(NDArray.empty(_, ctx))).toMap + + val gradDict = argNames.zip(argShapes).filter { case (name, shape) => + !datasAndLabels.contains(name) + }.map(x => x._1 -> NDArray.empty(x._2, ctx) ).toMap + + argDict.foreach { case (name, ndArray) => + if (!datasAndLabels.contains(name)) { + initializer.initWeight(name, ndArray) + } + } + + val data = argDict("data") + val label1 = argDict("softmax1_label") + val label2 = argDict("softmax2_label") + + val maxGradNorm = 0.5f + val executor = network.bind(ctx, argDict, gradDict) + + val opt = new RMSProp(learningRate = lr, wd = 0.00001f) + + val paramsGrads = gradDict.toList.zipWithIndex.map { case ((name, grad), idx) => + (idx, name, grad, opt.createState(idx, argDict(name))) + } + + val evalMetric = new MultiAccuracy(num = 2, name = "multi_accuracy") + val batchEndCallback = new Speedometer(batchSize, 50) + + for (epoch <- 0 until numEpoch) { + // Training phase + val tic = System.currentTimeMillis + evalMetric.reset() + var nBatch = 0 + var epochDone = false + // Iterate over training data. 
+ trainMultiIter.reset() + + while (!epochDone) { + var doReset = true + while (doReset && trainMultiIter.hasNext) { + val dataBatch = trainMultiIter.next() + + data.set(dataBatch.data(0)) + label1.set(dataBatch.label(0)) + label2.set(dataBatch.label(1)) + + executor.forward(isTrain = true) + executor.backward() + + val norm = Math.sqrt(paramsGrads.map { case (idx, name, grad, optimState) => + val l2Norm = NDArray.norm(grad / batchSize).toScalar + l2Norm * l2Norm + }.sum).toFloat + + paramsGrads.foreach { case (idx, name, grad, optimState) => + if (norm > maxGradNorm) { + grad.set(grad.toArray.map(_ * (maxGradNorm / norm))) + opt.update(idx, argDict(name), grad, optimState) + } else opt.update(idx, argDict(name), grad, optimState) + } + + // evaluate at end, so out_cpu_array can lazy copy + evalMetric.update(dataBatch.label, executor.outputs) + + nBatch += 1 + batchEndCallback.invoke(epoch, nBatch, evalMetric) + } + if (doReset) { + trainMultiIter.reset() + } + // this epoch is done + epochDone = true + } + var nameVals = evalMetric.get + nameVals.foreach { case (name, value) => + logger.info(s"Epoch[$epoch] Train-$name=$value") + } + val toc = System.currentTimeMillis + logger.info(s"Epoch[$epoch] Time cost=${toc - tic}") + + evalMetric.reset() + valMultiIter.reset() + while (valMultiIter.hasNext) { + val evalBatch = valMultiIter.next() + + data.set(evalBatch.data(0)) + label1.set(evalBatch.label(0)) + label2.set(evalBatch.label(1)) + + executor.forward(isTrain = true) + + evalMetric.update(evalBatch.label, executor.outputs) + evalBatch.dispose() + } + + nameVals = evalMetric.get + nameVals.foreach { case (name, value) => + logger.info(s"Epoch[$epoch] Validation-$name=$value") + } + } + executor.dispose() + + } catch { + case ex: Exception => { + logger.error(ex.getMessage, ex) + parser.printUsage(System.err) + sys.exit(1) + } + } + } +} + +class ExampleMultiTask { + @Option(name = "--data-path", usage = "the mnist data path") + private val dataPath: String = null 
+ @Option(name = "--gpu", usage = "which gpu card to use, default is -1, means using cpu") + private val gpu: Int = -1 +} diff --git a/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/neuralstyle/ModelVgg19.scala b/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/neuralstyle/ModelVgg19.scala index 218b13eb30ac..f431d74acf22 100644 --- a/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/neuralstyle/ModelVgg19.scala +++ b/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/neuralstyle/ModelVgg19.scala @@ -17,66 +17,66 @@ object ModelVgg19 { def getSymbol(): (Symbol, Symbol) = { // declare symbol val data = Symbol.Variable("data") - val conv1_1 = Symbol.Convolution("conv1_1")(Map("data" -> data , "num_filter" -> 64, + val conv1_1 = Symbol.Convolution("conv1_1")()(Map("data" -> data , "num_filter" -> 64, "pad" -> "(1,1)", "kernel" -> "(3,3)", "stride" -> "(1,1)", "no_bias" -> false, "workspace" -> 1024)) - val relu1_1 = Symbol.Activation("relu1_1")(Map("data" -> conv1_1 , "act_type" -> "relu")) - val conv1_2 = Symbol.Convolution("conv1_2")(Map("data" -> relu1_1 , "num_filter" -> 64, + val relu1_1 = Symbol.Activation("relu1_1")()(Map("data" -> conv1_1 , "act_type" -> "relu")) + val conv1_2 = Symbol.Convolution("conv1_2")()(Map("data" -> relu1_1 , "num_filter" -> 64, "pad" -> "(1,1)", "kernel" -> "(3,3)", "stride" -> "(1,1)", "no_bias" -> false, "workspace" -> 1024)) - val relu1_2 = Symbol.Activation("relu1_2")(Map("data" -> conv1_2 , "act_type" -> "relu")) - val pool1 = Symbol.Pooling("pool1")(Map("data" -> relu1_2 , "pad" -> "(0,0)", + val relu1_2 = Symbol.Activation("relu1_2")()(Map("data" -> conv1_2 , "act_type" -> "relu")) + val pool1 = Symbol.Pooling("pool1")()(Map("data" -> relu1_2 , "pad" -> "(0,0)", "kernel" -> "(2,2)", "stride" -> "(2,2)", "pool_type" -> "avg")) - val conv2_1 = Symbol.Convolution("conv2_1")(Map("data" -> pool1 , "num_filter" -> 128, + val conv2_1 = Symbol.Convolution("conv2_1")()(Map("data" -> pool1 , 
"num_filter" -> 128, "pad" -> "(1,1)", "kernel" -> "(3,3)", "stride" -> "(1,1)", "no_bias" -> false, "workspace" -> 1024)) - val relu2_1 = Symbol.Activation("relu2_1")(Map("data" -> conv2_1 , "act_type" -> "relu")) - val conv2_2 = Symbol.Convolution("conv2_2")(Map("data" -> relu2_1 , "num_filter" -> 128, + val relu2_1 = Symbol.Activation("relu2_1")()(Map("data" -> conv2_1 , "act_type" -> "relu")) + val conv2_2 = Symbol.Convolution("conv2_2")()(Map("data" -> relu2_1 , "num_filter" -> 128, "pad" -> "(1,1)", "kernel" -> "(3,3)", "stride" -> "(1,1)", "no_bias" -> false, "workspace" -> 1024)) - val relu2_2 = Symbol.Activation("relu2_2")(Map("data" -> conv2_2 , "act_type" -> "relu")) - val pool2 = Symbol.Pooling("pool2")(Map("data" -> relu2_2 , "pad" -> "(0,0)", + val relu2_2 = Symbol.Activation("relu2_2")()(Map("data" -> conv2_2 , "act_type" -> "relu")) + val pool2 = Symbol.Pooling("pool2")()(Map("data" -> relu2_2 , "pad" -> "(0,0)", "kernel" -> "(2,2)", "stride" -> "(2,2)", "pool_type" -> "avg")) - val conv3_1 = Symbol.Convolution("conv3_1")(Map("data" -> pool2 , "num_filter" -> 256, + val conv3_1 = Symbol.Convolution("conv3_1")()(Map("data" -> pool2 , "num_filter" -> 256, "pad" -> "(1,1)", "kernel" -> "(3,3)", "stride" -> "(1,1)", "no_bias" -> false, "workspace" -> 1024)) - val relu3_1 = Symbol.Activation("relu3_1")(Map("data" -> conv3_1 , "act_type" -> "relu")) - val conv3_2 = Symbol.Convolution("conv3_2")(Map("data" -> relu3_1 , "num_filter" -> 256, + val relu3_1 = Symbol.Activation("relu3_1")()(Map("data" -> conv3_1 , "act_type" -> "relu")) + val conv3_2 = Symbol.Convolution("conv3_2")()(Map("data" -> relu3_1 , "num_filter" -> 256, "pad" -> "(1,1)", "kernel" -> "(3,3)", "stride" -> "(1,1)", "no_bias" -> false, "workspace" -> 1024)) - val relu3_2 = Symbol.Activation("'relu3_2")(Map("data" -> conv3_2 , "act_type" -> "relu")) - val conv3_3 = Symbol.Convolution("conv3_3")(Map("data" -> relu3_2 , "num_filter" -> 256, + val relu3_2 = 
Symbol.Activation("'relu3_2")()(Map("data" -> conv3_2 , "act_type" -> "relu")) + val conv3_3 = Symbol.Convolution("conv3_3")()(Map("data" -> relu3_2 , "num_filter" -> 256, "pad" -> "(1,1)", "kernel" -> "(3,3)", "stride" -> "(1,1)", "no_bias" -> false, "workspace" -> 1024)) - val relu3_3 = Symbol.Activation("relu3_3")(Map("data" -> conv3_3 , "act_type" -> "relu")) - val conv3_4 = Symbol.Convolution("conv3_4")(Map("data" -> relu3_3 , "num_filter" -> 256, + val relu3_3 = Symbol.Activation("relu3_3")()(Map("data" -> conv3_3 , "act_type" -> "relu")) + val conv3_4 = Symbol.Convolution("conv3_4")()(Map("data" -> relu3_3 , "num_filter" -> 256, "pad" -> "(1,1)", "kernel" -> "(3,3)", "stride" -> "(1,1)", "no_bias" -> false, "workspace" -> 1024)) - val relu3_4 = Symbol.Activation("relu3_4")(Map("data" -> conv3_4 , "act_type" -> "relu")) - val pool3 = Symbol.Pooling("pool3")(Map("data" -> relu3_4 , "pad" -> "(0,0)", + val relu3_4 = Symbol.Activation("relu3_4")()(Map("data" -> conv3_4 , "act_type" -> "relu")) + val pool3 = Symbol.Pooling("pool3")()(Map("data" -> relu3_4 , "pad" -> "(0,0)", "kernel" -> "(2,2)", "stride" -> "(2,2)", "pool_type" -> "avg")) - val conv4_1 = Symbol.Convolution("conv4_1")(Map("data" -> pool3 , "num_filter" -> 512, + val conv4_1 = Symbol.Convolution("conv4_1")()(Map("data" -> pool3 , "num_filter" -> 512, "pad" -> "(1,1)", "kernel" -> "(3,3)", "stride" -> "(1,1)", "no_bias" -> false, "workspace" -> 1024)) - val relu4_1 = Symbol.Activation("relu4_1")(Map("data" -> conv4_1 , "act_type" -> "relu")) - val conv4_2 = Symbol.Convolution("conv4_2")(Map("data" -> relu4_1 , "num_filter" -> 512, + val relu4_1 = Symbol.Activation("relu4_1")()(Map("data" -> conv4_1 , "act_type" -> "relu")) + val conv4_2 = Symbol.Convolution("conv4_2")()(Map("data" -> relu4_1 , "num_filter" -> 512, "pad" -> "(1,1)", "kernel" -> "(3,3)", "stride" -> "(1,1)", "no_bias" -> false, "workspace" -> 1024)) - val relu4_2 = Symbol.Activation("relu4_2")(Map("data" -> conv4_2 , "act_type" -> 
"relu")) - val conv4_3 = Symbol.Convolution("conv4_3")(Map("data" -> relu4_2 , "num_filter" -> 512, + val relu4_2 = Symbol.Activation("relu4_2")()(Map("data" -> conv4_2 , "act_type" -> "relu")) + val conv4_3 = Symbol.Convolution("conv4_3")()(Map("data" -> relu4_2 , "num_filter" -> 512, "pad" -> "(1,1)", "kernel" -> "(3,3)", "stride" -> "(1,1)", "no_bias" -> false, "workspace" -> 1024)) - val relu4_3 = Symbol.Activation("relu4_3")(Map("data" -> conv4_3 , "act_type" -> "relu")) - val conv4_4 = Symbol.Convolution("conv4_4")(Map("data" -> relu4_3 , "num_filter" -> 512, + val relu4_3 = Symbol.Activation("relu4_3")()(Map("data" -> conv4_3 , "act_type" -> "relu")) + val conv4_4 = Symbol.Convolution("conv4_4")()(Map("data" -> relu4_3 , "num_filter" -> 512, "pad" -> "(1,1)", "kernel" -> "(3,3)", "stride" -> "(1,1)", "no_bias" -> false, "workspace" -> 1024)) - val relu4_4 = Symbol.Activation("relu4_4")(Map("data" -> conv4_4 , "act_type" -> "relu")) - val pool4 = Symbol.Pooling("pool4")(Map("data" -> relu4_4 , "pad" -> "(0,0)", + val relu4_4 = Symbol.Activation("relu4_4")()(Map("data" -> conv4_4 , "act_type" -> "relu")) + val pool4 = Symbol.Pooling("pool4")()(Map("data" -> relu4_4 , "pad" -> "(0,0)", "kernel" -> "(2,2)", "stride" -> "(2,2)", "pool_type" -> "avg")) - val conv5_1 = Symbol.Convolution("conv5_1")(Map("data" -> pool4 , "num_filter" -> 512, + val conv5_1 = Symbol.Convolution("conv5_1")()(Map("data" -> pool4 , "num_filter" -> 512, "pad" -> "(1,1)", "kernel" -> "(3,3)", "stride" -> "(1,1)", "no_bias" -> false, "workspace" -> 1024)) - val relu5_1 = Symbol.Activation("relu5_1")(Map("data" -> conv5_1 , "act_type" -> "relu")) + val relu5_1 = Symbol.Activation("relu5_1")()(Map("data" -> conv5_1 , "act_type" -> "relu")) // style and content layers val style = Symbol.Group(relu1_1, relu2_1, relu3_1, relu4_1, relu5_1) diff --git a/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/neuralstyle/NeuralStyle.scala 
b/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/neuralstyle/NeuralStyle.scala index 7f027b6d756a..c8d6795fa00c 100644 --- a/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/neuralstyle/NeuralStyle.scala +++ b/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/neuralstyle/NeuralStyle.scala @@ -92,10 +92,10 @@ object NeuralStyle { var gradScale = List[Int]() for (i <- 0 until style.listOutputs().length) { val shape = outputShape(i) - val x = Symbol.Reshape()(Map("data" -> style.get(i), + val x = Symbol.Reshape()()(Map("data" -> style.get(i), "target_shape" -> Shape(shape(1), shape(2) * shape(3)))) // use fully connected to quickly do dot(x, x^T) - val gram = Symbol.FullyConnected()(Map("data" -> x, "weight" -> x, + val gram = Symbol.FullyConnected()()(Map("data" -> x, "weight" -> x, "no_bias" -> true, "num_hidden" -> shape(1))) gramList = gramList :+ gram gradScale = gradScale :+ (shape(1) * shape(2) * shape(3) * shape(1)) @@ -107,10 +107,10 @@ object NeuralStyle { var gramLoss = List[Symbol]() for (i <- 0 until gram.listOutputs().length) { val gvar = Symbol.Variable(s"target_gram_$i") - gramLoss = gramLoss :+ Symbol.sum(Symbol.square(gvar - gram.get(i))) + gramLoss = gramLoss :+ Symbol.sum()(Symbol.square()(gvar - gram.get(i))())() } val cvar = Symbol.Variable("target_content") - val contentLoss = Symbol.sum(Symbol.square(cvar - content)) + val contentLoss = Symbol.sum()(Symbol.square()(cvar - content)())() (Symbol.Group(gramLoss: _*), contentLoss) } @@ -121,12 +121,12 @@ object NeuralStyle { val nChannel = img.shape(1) val sImg = Symbol.Variable("img") val sKernel = Symbol.Variable("kernel") - val channels = Symbol.SliceChannel()(Array(sImg), Map("num_outputs" -> nChannel)) + val channels = Symbol.SliceChannel()(sImg)(Map("num_outputs" -> nChannel)) val out = Symbol.Concat()((0 until nChannel).map { i => - Symbol.Convolution()(Map("data" -> channels.get(i), "weight" -> sKernel, + Symbol.Convolution()()(Map("data" -> 
channels.get(i), "weight" -> sKernel, "num_filter" -> 1, "kernel" -> "(3,3)", "pad" -> "(1,1)", "no_bias" -> true, "stride" -> "(1,1)")) - }.toArray) * tvWeight + }: _*)() * tvWeight val kernel = { val tmp = NDArray.empty(Shape(1, 1, 3, 3), ctx) tmp.set(Array[Float](0, -1, 0, -1, 4, -1, 0, -1, 0)) diff --git a/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/neuralstyle/end2end/Basic.scala b/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/neuralstyle/end2end/Basic.scala new file mode 100644 index 000000000000..5ce1b71b0347 --- /dev/null +++ b/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/neuralstyle/end2end/Basic.scala @@ -0,0 +1,106 @@ +package ml.dmlc.mxnet.examples.neuralstyle.end2end + +import ml.dmlc.mxnet.Shape +import ml.dmlc.mxnet.Context +import ml.dmlc.mxnet.NDArray +import ml.dmlc.mxnet.Symbol +import ml.dmlc.mxnet.Initializer +import org.slf4j.LoggerFactory + +/** + * @author Depeng Liang + */ +object Basic { + + class PretrainedInit(prefix: String, params: Map[String, NDArray], + verbose: Boolean = false) extends Initializer { + + private val logger = LoggerFactory.getLogger(classOf[PretrainedInit]) + + private val prefixLen = prefix.length() + 1 + private val argParams = params.filter(_._1.startsWith("arg:")) + private val auxParams = params.filter(_._1.startsWith("aux:")) + private val argNames = argParams.keySet.map(_.substring(4)) + private val auxNames = auxParams.keySet.map(_.substring(4)) + + override def initWeight(name: String, arr: NDArray): Unit = { + val key = name.substring(prefixLen) + if (this.argNames.contains(key)) { + if (verbose) logger.info(s"Init $name") + arr.set(this.argParams(s"arg:$key")) + } else if (this.auxNames.contains(key)) { + if (verbose) logger.info(s"Init $name") + arr.set(this.auxParams(s"aux:$key")) + } else { + logger.info(s"Unknown params: $name, init with 0") + arr.set(0f) + } + } + + override def initDefault(name: String, arr: NDArray): Unit = { + } + } + + def 
getStyleModule(prefix: String, dShape: Shape, + ctx: Context, params: Map[String, NDArray]): Module = { + val inputShape = Map(s"${prefix}_data" -> dShape) + val (style, content) = ModelVgg19.getVggSymbol(prefix) + val (gram, gScale) = styleGramSymbol(inputShape, style) + val init = new PretrainedInit(prefix, params, true) + new Module(symbol = gram, context = ctx, + dataShapes = Map(s"${prefix}_data" -> dShape), + initializer = init, forTraining = false) + } + + def styleGramSymbol(inputShape: Map[String, Shape], style: Symbol): (Symbol, List[Int]) = { + val (_, outputShape, _) = style.inferShape(inputShape) + var gramList = List[Symbol]() + var gradScale = List[Int]() + for (i <- 0 until style.listOutputs().length) { + val shape = outputShape(i) + val x = Symbol.Reshape()()(Map("data" -> style.get(i), + "shape" -> Shape(shape(1), shape(2) * shape(3)))) + // use fully connected to quickly do dot(x, x^T) + val gram = Symbol.FullyConnected()()(Map("data" -> x, "weight" -> x, + "no_bias" -> true, "num_hidden" -> shape(1))) + gramList = gramList :+ gram + gradScale = gradScale :+ (shape(1) * shape(2) * shape(3) * shape(1)) + } + (Symbol.Group(gramList: _*), gradScale) + } + + def getLoss(gram: Symbol, content: Symbol): (Symbol, Symbol) = { + var gramLoss = List[Symbol]() + for (i <- 0 until gram.listOutputs().length) { + val gvar = Symbol.Variable(s"target_gram_$i") + gramLoss = gramLoss :+ Symbol.sum()(Symbol.square()(gvar - gram.get(i))())() + } + val cvar = Symbol.Variable("target_content") + val contentLoss = Symbol.sum()(Symbol.square()(cvar - content)())() + (Symbol.Group(gramLoss: _*), contentLoss) + } + + def getContentModule(prefix: String, dShape: Shape, + ctx: Context, params: Map[String, NDArray]): Module = { + val (_, sym) = ModelVgg19.getVggSymbol(prefix, true) + val init = new PretrainedInit(prefix, params) + new Module(symbol = sym, context = ctx, + dataShapes = Map(s"${prefix}_data" -> dShape), + initializer = init, forTraining = false) + } + + def 
getLossModule(prefix: String, dShape: Shape, + ctx: Context, params: Map[String, NDArray]): (Module, List[Int]) = { + val inputShape = Map(s"${prefix}_data" -> dShape) + val (style, content) = ModelVgg19.getVggSymbol(prefix) + val (gram, gScale) = styleGramSymbol(inputShape, style) + val (styleLoss, contentLoss) = getLoss(gram, content) + val sym = Symbol.Group(styleLoss, contentLoss) + val init = new PretrainedInit(prefix, params, true) + val mod = new Module(symbol = sym, context = ctx, + dataShapes = Map(s"${prefix}_data" -> dShape), + initializer = init, forTraining = true, + inputsNeedGrad = true) + (mod, gScale) + } +} diff --git a/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/neuralstyle/end2end/BoostInference.scala b/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/neuralstyle/end2end/BoostInference.scala new file mode 100644 index 000000000000..472956be4177 --- /dev/null +++ b/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/neuralstyle/end2end/BoostInference.scala @@ -0,0 +1,70 @@ +package ml.dmlc.mxnet.examples.neuralstyle.end2end + +import org.slf4j.LoggerFactory +import org.kohsuke.args4j.{CmdLineParser, Option} +import scala.collection.JavaConverters._ +import ml.dmlc.mxnet.Shape +import ml.dmlc.mxnet.Context + +/** + * @author Depeng Liang + */ +object BoostInference { + + private val logger = LoggerFactory.getLogger(classOf[BoostInference]) + + def main(args: Array[String]): Unit = { + val stce = new BoostInference + val parser: CmdLineParser = new CmdLineParser(stce) + try { + parser.parseArgument(args.toList.asJava) + assert(stce.modelPath != null + && stce.inputImage != null + && stce.outputPath != null) + + val dShape = Shape(1, 3, 480, 640) + val clipNorm = 1.0f * dShape.product + val ctx = if (stce.gpu == -1) Context.cpu() else Context.gpu(stce.gpu) + + // generator + val gens = Array( + GenV4.getModule("g0", dShape, ctx, isTrain = false), + GenV3.getModule("g1", dShape, ctx, isTrain = false), + 
GenV3.getModule("g2", dShape, ctx, isTrain = false), + GenV4.getModule("g3", dShape, ctx, isTrain = false) + ) + gens.zipWithIndex.foreach { case (gen, i) => + gen.loadParams(s"${stce.modelPath}/$i/v3_0002-0026000.params") + } + + val contentNp = + DataProcessing.preprocessContentImage(s"${stce.inputImage}", dShape, ctx) + var data = Array(contentNp) + for (i <- 0 until gens.length) { + gens(i).forward(data.takeRight(1)) + val newImg = gens(i).getOutputs()(0) + data :+= newImg + DataProcessing.saveImage(newImg, s"${stce.outputPath}/out_${i}.jpg", stce.guassianRadius) + } + } catch { + case ex: Exception => { + logger.error(ex.getMessage, ex) + parser.printUsage(System.err) + sys.exit(1) + } + } + } +} + +class BoostInference { + @Option(name = "--model-path", usage = "the save model path") + private val modelPath: String = null + @Option(name = "--input-image", usage = "the style image") + private val inputImage: String = null + @Option(name = "--output-path", usage = "the output result path") + private val outputPath: String = null + @Option(name = "--gpu", usage = "which gpu card to use, default is -1, means using cpu") + private val gpu: Int = -1 + @Option(name = "--guassian-radius", usage = "the gaussian blur filter radius") + private val guassianRadius: Int = 2 +} diff --git a/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/neuralstyle/end2end/BoostTrain.scala b/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/neuralstyle/end2end/BoostTrain.scala new file mode 100644 index 000000000000..91b2a5f5d38a --- /dev/null +++ b/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/neuralstyle/end2end/BoostTrain.scala @@ -0,0 +1,191 @@ +package ml.dmlc.mxnet.examples.neuralstyle.end2end + +import org.slf4j.LoggerFactory +import org.kohsuke.args4j.{CmdLineParser, Option} +import scala.collection.JavaConverters._ +import ml.dmlc.mxnet.NDArray +import ml.dmlc.mxnet.Shape +import ml.dmlc.mxnet.Context +import ml.dmlc.mxnet.DataBatch +import 
ml.dmlc.mxnet.Symbol +import ml.dmlc.mxnet.Executor +import ml.dmlc.mxnet.optimizer.SGD +import java.io.File +import javax.imageio.ImageIO +import scala.util.Random +import ml.dmlc.mxnet.optimizer.Adam + +/** + * @author Depeng Liang + */ +object BoostTrain { + + private val logger = LoggerFactory.getLogger(classOf[BoostTrain]) + + def getTvGradExecutor(img: NDArray, ctx: Context, tvWeight: Float): Executor = { + // create TV gradient executor with input binded on img + if (tvWeight <= 0.0f) null + + val nChannel = img.shape(1) + val sImg = Symbol.Variable("img") + val sKernel = Symbol.Variable("kernel") + val channels = Symbol.SliceChannel()(sImg)(Map("num_outputs" -> nChannel)) + val out = Symbol.Concat()((0 until nChannel).map { i => + Symbol.Convolution()()(Map("data" -> channels.get(i), "weight" -> sKernel, + "num_filter" -> 1, "kernel" -> "(3,3)", "pad" -> "(1,1)", + "no_bias" -> true, "stride" -> "(1,1)")) + }.toArray: _*)() * tvWeight + val kernel = { + val tmp = NDArray.empty(Shape(1, 1, 3, 3), ctx) + tmp.set(Array[Float](0, -1, 0, -1, 4, -1, 0, -1, 0)) + tmp / 8.0f + } + out.bind(ctx, Map("img" -> img, "kernel" -> kernel)) + } + + def main(args: Array[String]): Unit = { + val stin = new BoostTrain + val parser: CmdLineParser = new CmdLineParser(stin) + try { + parser.parseArgument(args.toList.asJava) + assert(stin.dataPath != null + && stin.vggModelPath != null + && stin.saveModelPath != null + && stin.styleImage != null) + // params + val vggParams = NDArray.load2Map(stin.vggModelPath) + val styleWeight = 1.2f + val contentWeight = 10f + val dShape = Shape(1, 3, 384, 384) + val clipNorm = 0.05f * dShape.product + val modelPrefix = "v3" + val ctx = if (stin.gpu == -1) Context.cpu() else Context.gpu(stin.gpu) + + // init style + val styleNp = DataProcessing.preprocessStyleImage(stin.styleImage, dShape, ctx) + var styleMod = Basic.getStyleModule("style", dShape, ctx, vggParams) + styleMod.forward(Array(styleNp)) + val styleArray = 
styleMod.getOutputs().map(_.copyTo(Context.cpu())) + styleMod.dispose() + styleMod = null + + // content + val contentMod = Basic.getContentModule("content", dShape, ctx, vggParams) + + // loss + val (loss, gScale) = Basic.getLossModule("loss", dShape, ctx, vggParams) + val extraArgs = (0 until styleArray.length) + .map( i => s"target_gram_$i" -> styleArray(i)).toMap + loss.setParams(extraArgs) + var gradArray = Array[NDArray]() + for (i <- 0 until styleArray.length) { + gradArray = gradArray :+ (NDArray.ones(Shape(1), ctx) * (styleWeight / gScale(i))) + } + gradArray = gradArray :+ (NDArray.ones(Shape(1), ctx) * contentWeight) + + // generator + val gens = Array( + GenV4.getModule("g0", dShape, ctx), + GenV3.getModule("g1", dShape, ctx), + GenV3.getModule("g2", dShape, ctx), + GenV4.getModule("g3", dShape, ctx) + ) + gens.foreach { gen => + val opt = new SGD(learningRate = 1e-4f, + momentum = 0.9f, + wd = 5e-3f, + clipGradient = 5f) + gen.initOptimizer(opt) + } + + var filelist = new File(stin.dataPath).list().toList + val numImage = filelist.length + logger.info(s"Dataset size: $numImage") + + val tvWeight = 1e-2f + + val startEpoch = 0 + val endEpoch = 3 + + for (k <- 0 until gens.length) { + val path = new File(s"${stin.saveModelPath}/$k") + if (!path.exists()) path.mkdir() + } + + // train + for (i <- startEpoch until endEpoch) { + filelist = Random.shuffle(filelist) + for (idx <- filelist.indices) { + var dataArray = Array[NDArray]() + var lossGradArray = Array[NDArray]() + val data = + DataProcessing.preprocessContentImage(s"${stin.dataPath}/${filelist(idx)}", dShape, ctx) + dataArray = dataArray :+ data + // get content + contentMod.forward(Array(data)) + // set target content + loss.setParams(Map("target_content" -> contentMod.getOutputs()(0))) + // gen_forward + for (k <- 0 until gens.length) { + gens(k).forward(dataArray.takeRight(1)) + dataArray = dataArray :+ gens(k).getOutputs()(0) + // loss forward + loss.forward(dataArray.takeRight(1)) + 
loss.backward(gradArray) + lossGradArray = lossGradArray :+ loss.getInputGrads()(0) + } + val grad = NDArray.zeros(data.shape, ctx) + for (k <- gens.length - 1 to 0 by -1) { + val tvGradExecutor = getTvGradExecutor(gens(k).getOutputs()(0), ctx, tvWeight) + tvGradExecutor.forward() + grad += lossGradArray(k) + tvGradExecutor.outputs(0) + val gNorm = NDArray.norm(grad) + if (gNorm.toScalar > clipNorm) { + grad *= clipNorm / gNorm.toScalar + } + gens(k).backward(Array(grad)) + gens(k).update() + gNorm.dispose() + tvGradExecutor.dispose() + } + grad.dispose() + if (idx % 20 == 0) { + logger.info(s"Epoch $i: Image $idx") + for (k <- 0 until gens.length) { + val n = NDArray.norm(gens(k).getInputGrads()(0)) + logger.info(s"Data Norm : ${n.toScalar / dShape.product}") + n.dispose() + } + } + if (idx % 1000 == 0) { + for (k <- 0 until gens.length) { + gens(k).saveParams( + s"${stin.saveModelPath}/$k/${modelPrefix}_" + + s"${"%04d".format(i)}-${"%07d".format(idx)}.params") + } + } + data.dispose() + } + } + } catch { + case ex: Exception => { + logger.error(ex.getMessage, ex) + parser.printUsage(System.err) + sys.exit(1) + } + } + } +} + +class BoostTrain { + @Option(name = "--data-path", usage = "the input train data path") + private val dataPath: String = null + @Option(name = "--vgg--model-path", usage = "the pretrained model to use: ['vgg']") + private val vggModelPath: String = null + @Option(name = "--save--model-path", usage = "the save model path") + private val saveModelPath: String = null + @Option(name = "--style-image", usage = "the style image") + private val styleImage: String = null + @Option(name = "--gpu", usage = "which gpu card to use, default is -1, means using cpu") + private val gpu: Int = -1 +} diff --git a/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/neuralstyle/end2end/DataProcessing.scala b/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/neuralstyle/end2end/DataProcessing.scala new file mode 100644 index 
000000000000..daa0e8f856da --- /dev/null +++ b/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/neuralstyle/end2end/DataProcessing.scala @@ -0,0 +1,77 @@ +package ml.dmlc.mxnet.examples.neuralstyle.end2end + +import com.sksamuel.scrimage.Image +import com.sksamuel.scrimage.Pixel +import com.sksamuel.scrimage.filter.GaussianBlurFilter +import com.sksamuel.scrimage.nio.JpegWriter +import ml.dmlc.mxnet.Context +import ml.dmlc.mxnet.NDArray +import java.io.File +import ml.dmlc.mxnet.Shape +import scala.util.Random + +/** + * @author Depeng Liang + */ +object DataProcessing { + + def preprocessContentImage(path: String, + dShape: Shape = null, ctx: Context): NDArray = { + val img = Image(new File(path)) + val resizedImg = img.scaleTo(dShape(3), dShape(2)) + val sample = NDArray.empty(Shape(1, 3, resizedImg.height, resizedImg.width), ctx) + val datas = { + val rgbs = resizedImg.iterator.toArray.map { p => + (p.red, p.green, p.blue) + } + val r = rgbs.map(_._1 - 123.68f) + val g = rgbs.map(_._2 - 116.779f) + val b = rgbs.map(_._3 - 103.939f) + r ++ g ++ b + } + sample.set(datas) + sample + } + + def preprocessStyleImage(path: String, shape: Shape, ctx: Context): NDArray = { + val img = Image(new File(path)) + val resizedImg = img.scaleTo(shape(3), shape(2)) + val sample = NDArray.empty(Shape(1, 3, shape(2), shape(3)), ctx) + val datas = { + val rgbs = resizedImg.iterator.toArray.map { p => + (p.red, p.green, p.blue) + } + val r = rgbs.map(_._1 - 123.68f) + val g = rgbs.map(_._2 - 116.779f) + val b = rgbs.map(_._3 - 103.939f) + r ++ g ++ b + } + sample.set(datas) + sample + } + + def clip(array: Array[Float]): Array[Float] = array.map { a => + if (a < 0) 0f + else if (a > 255) 255f + else a + } + + def postprocessImage(img: NDArray): Image = { + val datas = img.toArray + val spatialSize = img.shape(2) * img.shape(3) + val r = clip(datas.take(spatialSize).map(_ + 123.68f)) + val g = clip(datas.drop(spatialSize).take(spatialSize).map(_ + 116.779f)) + val b = 
clip(datas.takeRight(spatialSize).map(_ + 103.939f)) + val pixels = for (i <- 0 until spatialSize) + yield Pixel(r(i).toInt, g(i).toInt, b(i).toInt, 255) + Image(img.shape(3), img.shape(2), pixels.toArray) + } + + def saveImage(img: NDArray, filename: String, radius: Int): Unit = { + val out = postprocessImage(img) + val gauss = GaussianBlurFilter(radius).op + val result = Image(out.width, out.height) + gauss.filter(out.awt, result.awt) + result.output(filename)(JpegWriter()) + } +} diff --git a/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/neuralstyle/end2end/GenV3.scala b/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/neuralstyle/end2end/GenV3.scala new file mode 100644 index 000000000000..edf545df2e2f --- /dev/null +++ b/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/neuralstyle/end2end/GenV3.scala @@ -0,0 +1,68 @@ +package ml.dmlc.mxnet.examples.neuralstyle.end2end + +import ml.dmlc.mxnet.Symbol +import ml.dmlc.mxnet.Shape +import ml.dmlc.mxnet.Context +import ml.dmlc.mxnet.Xavier + +/** + * @author Depeng Liang + */ +object GenV3 { + def Conv(data: Symbol, numFilter: Int, kernel: (Int, Int) = (5, 5), + pad: (Int, Int) = (2, 2), stride: (Int, Int) = (2, 2)): Symbol = { + var sym = Symbol.Convolution()()(Map("data" -> data, "num_filter" -> numFilter, + "kernel" -> s"$kernel", "stride" -> s"$stride", "pad" -> s"$pad", "no_bias" -> false)) + sym = Symbol.BatchNorm()()(Map("data" -> sym, "fix_gamma" -> false)) + sym = Symbol.LeakyReLU()()(Map("data" -> sym, "act_type" -> "leaky")) + sym + } + + def Deconv(data: Symbol, numFilter: Int, imHw: (Int, Int), + kernel: (Int, Int) = (7, 7), pad: (Int, Int) = (2, 2), stride: (Int, Int) = (2, 2), + crop: Boolean = true, out: Boolean = false): Symbol = { + var sym = Symbol.Deconvolution()()(Map("data" -> data, "num_filter" -> numFilter, + "kernel" -> s"$kernel", "stride" -> s"$stride", "pad" -> s"$pad", "no_bias" -> true)) + if (crop) sym = Symbol.Crop()(sym)( + Map("offset" -> "(1, 
1)", "h_w" -> s"$imHw", "num_args" -> 1)) + sym = Symbol.BatchNorm()()(Map("data" -> sym, "fix_gamma" -> false)) + if (out == false) Symbol.LeakyReLU()()(Map("data" -> sym, "act_type" -> "leaky")) + else Symbol.Activation()()(Map("data" -> sym, "act_type" -> "tanh")) + } + + def getGenerator(prefix: String, imHw: (Int, Int)): Symbol = { + val data = Symbol.Variable(s"${prefix}_data") + val conv1 = Conv(data, 64) // 192 + val conv1_1 = Conv(conv1, 48, kernel = (3, 3), pad = (1, 1), stride = (1, 1)) + val conv2 = Conv(conv1_1, 128) // 96 + val conv2_1 = Conv(conv2, 96, kernel = (3, 3), pad = (1, 1), stride = (1, 1)) + val conv3 = Conv(conv2_1, 256) // 48 + val conv3_1 = Conv(conv3, 192, kernel = (3, 3), pad = (1, 1), stride = (1, 1)) + val deconv1 = Deconv(conv3_1, 128, (imHw._1 / 4, imHw._2 / 4)) + conv2 + val conv4_1 = Conv(deconv1, 160, kernel = (3, 3), pad = (1, 1), stride = (1, 1)) + val deconv2 = Deconv(conv4_1, 64, (imHw._1 / 2, imHw._2 / 2)) + conv1 + val conv5_1 = Conv(deconv2, 96, kernel = (3, 3), pad = (1, 1), stride = (1, 1)) + val deconv3 = Deconv(conv5_1, 3, imHw, kernel = (8, 8), pad = (3, 3), out = true, crop = false) + val rawOut = (deconv3 * 128) + 128 + val norm = Symbol.SliceChannel()(rawOut)(Map("num_outputs" -> 3)) + val rCh = norm.get(0) - 123.68f + val gCh = norm.get(1) - 116.779f + val bCh = norm.get(2) - 103.939f + val normOut = Symbol.Concat()(rCh, gCh, bCh)() * 0.4f + data * 0.6f + normOut + } + + def getModule(prefix: String, dShape: Shape, ctx: Context, isTrain: Boolean = true): Module = { + val sym = getGenerator(prefix, (dShape(2), dShape(3))) + val (dataShapes, forTraining, inputsNeedGrad) = { + val dataShape = Map(s"${prefix}_data" -> dShape) + if (isTrain) (dataShape, true, true) + else (dataShape, false, false) + } + val mod = new Module(symbol = sym, context = ctx, + dataShapes = dataShapes, + initializer = new Xavier(magnitude = 2f), + forTraining = forTraining, inputsNeedGrad = inputsNeedGrad) + mod + } +} diff --git 
a/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/neuralstyle/end2end/GenV4.scala b/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/neuralstyle/end2end/GenV4.scala new file mode 100644 index 000000000000..7426654ee16c --- /dev/null +++ b/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/neuralstyle/end2end/GenV4.scala @@ -0,0 +1,90 @@ +package ml.dmlc.mxnet.examples.neuralstyle.end2end + +import ml.dmlc.mxnet.Symbol +import ml.dmlc.mxnet.Shape +import ml.dmlc.mxnet.Context +import ml.dmlc.mxnet.Xavier + +/** + * @author Depeng Liang + */ +object GenV4 { + + def Conv(data: Symbol, numFilter: Int, kernel: (Int, Int) = (5, 5), + pad: (Int, Int) = (2, 2), stride: (Int, Int) = (2, 2)): Symbol = { + var sym = Symbol.Convolution()()(Map("data" -> data, "num_filter" -> numFilter, + "kernel" -> s"$kernel", "stride" -> s"$stride", "pad" -> s"$pad", "no_bias" -> false)) + sym = Symbol.BatchNorm()()(Map("data" -> sym, "fix_gamma" -> false)) + sym = Symbol.LeakyReLU()()(Map("data" -> sym, "act_type" -> "leaky")) + sym + } + + def Deconv(data: Symbol, numFilter: Int, imHw: (Int, Int), kernel: (Int, Int) = (6, 6), + pad: (Int, Int) = (2, 2), stride: (Int, Int) = (2, 2), out: Boolean = false): Symbol = { + var sym = Symbol.Deconvolution()()(Map("data" -> data, "num_filter" -> numFilter, + "kernel" -> s"$kernel", "stride" -> s"$stride", "pad" -> s"$pad", "no_bias" -> true)) + sym = Symbol.BatchNorm()()(Map("data" -> sym, "fix_gamma" -> false)) + if (out == false) Symbol.LeakyReLU()()(Map("data" -> sym, "act_type" -> "leaky")) + else Symbol.Activation()()(Map("data" -> sym, "act_type" -> "tanh")) + } + + def getGenerator(prefix: String, imHw: (Int, Int)): Symbol = { + val data = Symbol.Variable(s"${prefix}_data") + + var conv1_1 = Symbol.Convolution()()(Map("data" -> data, "num_filter" -> 48, + "kernel" -> "(5, 5)", "pad" -> "(2, 2)", "no_bias" -> false, "workspace" -> 4096)) + conv1_1 = Symbol.BatchNorm()()(Map("data" -> conv1_1, "fix_gamma" -> 
false)) + conv1_1 = Symbol.LeakyReLU()()(Map("data" -> conv1_1, "act_type" -> "leaky")) + + var conv2_1 = Symbol.Convolution()()(Map("data" -> conv1_1, "num_filter" -> 32, + "kernel" -> "(5, 5)", "pad" -> "(2, 2)", "no_bias" -> false, "workspace" -> 4096)) + conv2_1 = Symbol.BatchNorm()()(Map("data" -> conv2_1, "fix_gamma" -> false)) + conv2_1 = Symbol.LeakyReLU()()(Map("data" -> conv2_1, "act_type" -> "leaky")) + + var conv3_1 = Symbol.Convolution()()(Map("data" -> conv2_1, "num_filter" -> 64, + "kernel" -> "(3, 3)", "pad" -> "(1, 1)", "no_bias" -> false, "workspace" -> 4096)) + conv3_1 = Symbol.BatchNorm()()(Map("data" -> conv3_1, "fix_gamma" -> false)) + conv3_1 = Symbol.LeakyReLU()()(Map("data" -> conv3_1, "act_type" -> "leaky")) + + var conv4_1 = Symbol.Convolution()()(Map("data" -> conv3_1, "num_filter" -> 32, + "kernel" -> "(5, 5)", "pad" -> "(2, 2)", "no_bias" -> false, "workspace" -> 4096)) + conv4_1 = Symbol.BatchNorm()()(Map("data" -> conv4_1, "fix_gamma" -> false)) + conv4_1 = Symbol.LeakyReLU()()(Map("data" -> conv4_1, "act_type" -> "leaky")) + + var conv5_1 = Symbol.Convolution()()(Map("data" -> conv4_1, "num_filter" -> 48, + "kernel" -> "(5, 5)", "pad" -> "(2, 2)", "no_bias" -> false, "workspace" -> 4096)) + conv5_1 = Symbol.BatchNorm()()(Map("data" -> conv5_1, "fix_gamma" -> false)) + conv5_1 = Symbol.LeakyReLU()()(Map("data" -> conv5_1, "act_type" -> "leaky")) + + var conv6_1 = Symbol.Convolution()()(Map("data" -> conv5_1, "num_filter" -> 32, + "kernel" -> "(5, 5)", "pad" -> "(2, 2)", "no_bias" -> true, "workspace" -> 4096)) + conv6_1 = Symbol.BatchNorm()()(Map("data" -> conv6_1, "fix_gamma" -> false)) + conv6_1 = Symbol.LeakyReLU()()(Map("data" -> conv6_1, "act_type" -> "leaky")) + + var out = Symbol.Convolution()()(Map("data" -> conv6_1, "num_filter" -> 3, "kernel" -> "(3, 3)", + "pad" -> "(1, 1)", "no_bias" -> true, "workspace" -> 4096)) + out = Symbol.BatchNorm()()(Map("data" -> out, "fix_gamma" -> false)) + out = 
Symbol.Activation()()(Map("data" -> out, "act_type" -> "tanh")) + val rawOut = (out * 128) + 128 + val norm = Symbol.SliceChannel()(rawOut)(Map("num_outputs" -> 3)) + val rCh = norm.get(0) - 123.68f + val gCh = norm.get(1) - 116.779f + val bCh = norm.get(2) - 103.939f + val normOut = Symbol.Concat()(rCh, gCh, bCh)() * 0.4f + data * 0.6f + normOut + } + + def getModule(prefix: String, dShape: Shape, ctx: Context, isTrain: Boolean = true): Module = { + val sym = getGenerator(prefix, (dShape(2), dShape(3))) + val (dataShapes, forTraining, inputsNeedGrad) = { + val dataShape = Map(s"${prefix}_data" -> dShape) + if (isTrain) (dataShape, true, true) + else (dataShape, false, false) + } + val mod = new Module(symbol = sym, context = ctx, + dataShapes = dataShapes, + initializer = new Xavier(magnitude = 2f), + forTraining = forTraining, inputsNeedGrad = inputsNeedGrad) + mod + } +} diff --git a/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/neuralstyle/end2end/ModelVgg19.scala b/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/neuralstyle/end2end/ModelVgg19.scala new file mode 100644 index 000000000000..2c8fcdad334b --- /dev/null +++ b/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/neuralstyle/end2end/ModelVgg19.scala @@ -0,0 +1,100 @@ +package ml.dmlc.mxnet.examples.neuralstyle.end2end + +import ml.dmlc.mxnet.Executor +import ml.dmlc.mxnet.NDArray +import ml.dmlc.mxnet.Symbol +import ml.dmlc.mxnet.Context +import ml.dmlc.mxnet.Shape + +/** + * @author Depeng Liang + */ +object ModelVgg19 { + case class ConvExecutor(executor: Executor, data: NDArray, dataGrad: NDArray, + style: Array[NDArray], content: NDArray, argDict: Map[String, NDArray]) + + def getVggSymbol(prefix: String, contentOnly: Boolean = false): (Symbol, Symbol) = { + // declare symbol + val data = Symbol.Variable(s"${prefix}_data") + val conv1_1 = Symbol.Convolution(s"${prefix}_conv1_1")()(Map("data" -> data, + "num_filter" -> 64, "pad" -> "(1,1)", "kernel" -> "(3,3)", 
+ "stride" -> "(1,1)", "no_bias" -> false, "workspace" -> 1024)) + val relu1_1 = Symbol.Activation(s"${prefix}_relu1_1")()(Map("data" -> conv1_1, + "act_type" -> "relu")) + val conv1_2 = Symbol.Convolution(s"${prefix}_conv1_2")()(Map("data" -> relu1_1, + "num_filter" -> 64, "pad" -> "(1,1)", "kernel" -> "(3,3)", + "stride" -> "(1,1)", "no_bias" -> false, "workspace" -> 1024)) + val relu1_2 = Symbol.Activation(s"${prefix}_relu1_2")()(Map("data" -> conv1_2, + "act_type" -> "relu")) + val pool1 = Symbol.Pooling(s"${prefix}_pool1")()(Map("data" -> relu1_2 , "pad" -> "(0,0)", + "kernel" -> "(2,2)", "stride" -> "(2,2)", "pool_type" -> "avg")) + val conv2_1 = Symbol.Convolution(s"${prefix}_conv2_1")()(Map("data" -> pool1, + "num_filter" -> 128, "pad" -> "(1,1)", "kernel" -> "(3,3)", + "stride" -> "(1,1)", "no_bias" -> false, "workspace" -> 1024)) + val relu2_1 = Symbol.Activation(s"${prefix}_relu2_1")()(Map("data" -> conv2_1, + "act_type" -> "relu")) + val conv2_2 = Symbol.Convolution(s"${prefix}_conv2_2")()(Map("data" -> relu2_1, + "num_filter" -> 128, "pad" -> "(1,1)", "kernel" -> "(3,3)", + "stride" -> "(1,1)", "no_bias" -> false, "workspace" -> 1024)) + val relu2_2 = Symbol.Activation(s"${prefix}_relu2_2")()(Map("data" -> conv2_2, + "act_type" -> "relu")) + val pool2 = Symbol.Pooling("pool2")()(Map("data" -> relu2_2 , "pad" -> "(0,0)", + "kernel" -> "(2,2)", "stride" -> "(2,2)", "pool_type" -> "avg")) + val conv3_1 = Symbol.Convolution(s"${prefix}_conv3_1")()(Map("data" -> pool2, + "num_filter" -> 256, "pad" -> "(1,1)", "kernel" -> "(3,3)", + "stride" -> "(1,1)", "no_bias" -> false, "workspace" -> 1024)) + val relu3_1 = Symbol.Activation(s"${prefix}_relu3_1")()(Map("data" -> conv3_1, + "act_type" -> "relu")) + val conv3_2 = Symbol.Convolution(s"${prefix}_conv3_2")()(Map("data" -> relu3_1, + "num_filter" -> 256, "pad" -> "(1,1)", "kernel" -> "(3,3)", + "stride" -> "(1,1)", "no_bias" -> false, "workspace" -> 1024)) + val relu3_2 = 
Symbol.Activation(s"${prefix}_relu3_2")()(Map("data" -> conv3_2, + "act_type" -> "relu")) + val conv3_3 = Symbol.Convolution(s"${prefix}_conv3_3")()(Map("data" -> relu3_2, + "num_filter" -> 256, "pad" -> "(1,1)", "kernel" -> "(3,3)", + "stride" -> "(1,1)", "no_bias" -> false, "workspace" -> 1024)) + val relu3_3 = Symbol.Activation(s"${prefix}_relu3_3")()(Map("data" -> conv3_3, + "act_type" -> "relu")) + val conv3_4 = Symbol.Convolution(s"${prefix}_conv3_4")()(Map("data" -> relu3_3, + "num_filter" -> 256, "pad" -> "(1,1)", "kernel" -> "(3,3)", + "stride" -> "(1,1)", "no_bias" -> false, "workspace" -> 1024)) + val relu3_4 = Symbol.Activation(s"${prefix}_relu3_4")()(Map("data" -> conv3_4 , + "act_type" -> "relu")) + val pool3 = Symbol.Pooling(s"${prefix}_pool3")()(Map("data" -> relu3_4, + "pad" -> "(0,0)", "kernel" -> "(2,2)", "stride" -> "(2,2)", + "pool_type" -> "avg")) + val conv4_1 = Symbol.Convolution(s"${prefix}_conv4_1")()(Map("data" -> pool3, + "num_filter" -> 512, "pad" -> "(1,1)", "kernel" -> "(3,3)", + "stride" -> "(1,1)", "no_bias" -> false, "workspace" -> 1024)) + val relu4_1 = Symbol.Activation(s"${prefix}_relu4_1")()(Map("data" -> conv4_1, + "act_type" -> "relu")) + val conv4_2 = Symbol.Convolution(s"${prefix}_conv4_2")()(Map("data" -> relu4_1, + "num_filter" -> 512, "pad" -> "(1,1)", "kernel" -> "(3,3)", + "stride" -> "(1,1)", "no_bias" -> false, "workspace" -> 1024)) + val relu4_2 = Symbol.Activation(s"${prefix}_relu4_2")()(Map("data" -> conv4_2, + "act_type" -> "relu")) + val conv4_3 = Symbol.Convolution(s"${prefix}_conv4_3")()(Map("data" -> relu4_2, + "num_filter" -> 512, "pad" -> "(1,1)", "kernel" -> "(3,3)", + "stride" -> "(1,1)", "no_bias" -> false, "workspace" -> 1024)) + val relu4_3 = Symbol.Activation(s"${prefix}_relu4_3")()(Map("data" -> conv4_3, + "act_type" -> "relu")) + val conv4_4 = Symbol.Convolution(s"${prefix}_conv4_4")()(Map("data" -> relu4_3, + "num_filter" -> 512, "pad" -> "(1,1)", "kernel" -> "(3,3)", + "stride" -> "(1,1)", 
"no_bias" -> false, "workspace" -> 1024)) + val relu4_4 = Symbol.Activation(s"${prefix}_relu4_4")()(Map("data" -> conv4_4, + "act_type" -> "relu")) + val pool4 = Symbol.Pooling(s"${prefix}_pool4")()(Map("data" -> relu4_4, + "pad" -> "(0,0)", "kernel" -> "(2,2)", "stride" -> "(2,2)", + "pool_type" -> "avg")) + val conv5_1 = Symbol.Convolution(s"${prefix}_conv5_1")()(Map("data" -> pool4, + "num_filter" -> 512, "pad" -> "(1,1)", "kernel" -> "(3,3)", + "stride" -> "(1,1)", "no_bias" -> false, "workspace" -> 1024)) + val relu5_1 = Symbol.Activation(s"${prefix}_relu5_1")()(Map("data" -> conv5_1, + "act_type" -> "relu")) + + // style and content layers + val style = if (contentOnly) null else Symbol.Group(relu1_1, relu2_1, relu3_1, relu4_1, relu5_1) + val content = Symbol.Group(relu4_2) + (style, content) + } +} diff --git a/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/neuralstyle/end2end/Module.scala b/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/neuralstyle/end2end/Module.scala new file mode 100644 index 000000000000..f8a8716ce77e --- /dev/null +++ b/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/neuralstyle/end2end/Module.scala @@ -0,0 +1,131 @@ +package ml.dmlc.mxnet.examples.neuralstyle.end2end + +import ml.dmlc.mxnet.Context +import org.slf4j.LoggerFactory +import ml.dmlc.mxnet.Symbol +import ml.dmlc.mxnet.NDArray +import ml.dmlc.mxnet.Optimizer +import ml.dmlc.mxnet.Executor +import ml.dmlc.mxnet.Shape +import ml.dmlc.mxnet.Uniform +import ml.dmlc.mxnet.Initializer +import ml.dmlc.mxnet.DataBatch + +/** + * @author Depeng Liang + */ +class Module(symbol: Symbol, + context: Context, + dataShapes: Map[String, Shape], + labelShapes: Map[String, Shape] = Map[String, Shape](), + initializer: Initializer = new Uniform(0.01f), + forTraining: Boolean = true, + inputsNeedGrad: Boolean = false) { + + private val logger = LoggerFactory.getLogger(classOf[Module]) + + private val dataLabelShape = dataShapes ++ labelShapes + private 
val (argDict, gradDict, auxDict) = { + val (argShapes, outShapes, auxShapes) = symbol.inferShape(dataLabelShape) + val argNames = symbol.listArguments() + val argDict = argNames.zip(argShapes.map(NDArray.empty(_, context))).toMap + + val filterShapes = if (inputsNeedGrad) labelShapes else dataLabelShape + val gradDict = argNames.zip(argShapes).filter { case (name, shape) => + !filterShapes.contains(name) + }.map(x => x._1 -> NDArray.empty(x._2, context) ).toMap + + val auxDict = symbol.listAuxiliaryStates().zip(auxShapes.map(NDArray.empty(_, context))).toMap + + (argDict, gradDict, auxDict) + } + + private val dataArrs = dataShapes.keys.toArray.map(argDict(_)) + private val labelArrs = labelShapes.keys.toArray.map(argDict(_)) + private val dataGrads = { + if (inputsNeedGrad) dataShapes.keys.toArray.map(gradDict(_)) + else null + } + + argDict.foreach { case (name, ndArray) => + if (!dataLabelShape.contains(name)) initializer(name, ndArray) + } + + private val executor = symbol.bind(context, argDict, gradDict, "write", auxDict, null, null) + + private var optimizer: Optimizer = null + private var paramsGrads: List[(Int, String, NDArray, AnyRef)] = null + private var optimizerInitialized: Boolean = false + + def initOptimizer(opt: Optimizer): Unit = { + this.optimizer = opt + this.paramsGrads = gradDict.toList.zipWithIndex.map { case ((name, grad), idx) => + (idx, name, grad, this.optimizer.createState(idx, argDict(name))) + } + this.optimizerInitialized = true + } + + def forward(datas: Array[NDArray], labels: Array[NDArray] = Array[NDArray]()): Unit = { + datas.zip(this.dataArrs).foreach { case (src, dest) => dest.set(src) } + labels.zip(this.labelArrs).foreach { case (src, dest) => dest.set(src) } + this.executor.forward(isTrain = forTraining) + } + + def backward(outGrads: Array[NDArray]): Unit = { + this.executor.backward(outGrads) + } + + def update(): Unit = { + assert(this.optimizerInitialized) + paramsGrads.foreach { case (idx, name, grad, optimState) => + 
this.optimizer.update(idx, argDict(name), grad, optimState) + } + } + + def dispose(): Unit = { + this.executor.dispose() + this.argDict.foreach(_._2.dispose()) + this.gradDict.foreach(_._2.dispose()) + this.auxDict.foreach(_._2.dispose()) + } + + def setParams(params: Map[String, NDArray]): Unit = { + params.foreach { case (name, arr) => + if (this.argDict.contains(name)) { + this.argDict(name).set(arr) + } + else if (this.auxDict.contains(name)) { + this.auxDict(name).set(arr) + } + else logger.info(name) + } + } + + def loadParams(fName: String): Unit = { + val saveDict = NDArray.load2Map(fName) + var params = Map[String, NDArray]() + saveDict.foreach { case (k, v) => + val (argType, name) = { + val tmp = k.split(":") + (tmp(0), tmp(1)) + } + if (argType == "arg" || argType == "aux") { + params += name -> v + } + } + this.setParams(params) + } + + def saveParams(fName: String): Unit = { + val saveDict = { + argDict.filter(x => !dataLabelShape.contains(x._1)) + .map { case (k, v) => s"arg:$k" -> v } ++ + auxDict.map { case (k, v) => s"aux:$k" -> v } + } + NDArray.save(fName, saveDict) + } + + def getOutputs(): Array[NDArray] = this.executor.outputs + + def getInputGrads(): Array[NDArray] = this.dataGrads +} diff --git a/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/rnn/Lstm.scala b/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/rnn/Lstm.scala index 0e2e5f7de66b..6eda1cd860f9 100644 --- a/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/rnn/Lstm.scala +++ b/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/rnn/Lstm.scala @@ -16,26 +16,27 @@ object Lstm { def lstm(numHidden: Int, inData: Symbol, prevState: LSTMState, param: LSTMParam, seqIdx: Int, layerIdx: Int, dropout: Float = 0f): LSTMState = { val inDataa = { - if (dropout > 0f) Symbol.Dropout()(Map("data" -> inData, "p" -> dropout)) + if (dropout > 0f) Symbol.Dropout()()(Map("data" -> inData, "p" -> dropout)) else inData } - val i2h = 
Symbol.FullyConnected(s"t${seqIdx}_l${layerIdx}_i2h")(Map("data" -> inDataa, + val i2h = Symbol.FullyConnected(s"t${seqIdx}_l${layerIdx}_i2h")()(Map("data" -> inDataa, "weight" -> param.i2hWeight, "bias" -> param.i2hBias, "num_hidden" -> numHidden * 4)) - val h2h = Symbol.FullyConnected(s"t${seqIdx}_l${layerIdx}_h2h")(Map("data" -> prevState.h, + val h2h = Symbol.FullyConnected(s"t${seqIdx}_l${layerIdx}_h2h")()(Map("data" -> prevState.h, "weight" -> param.h2hWeight, "bias" -> param.h2hBias, "num_hidden" -> numHidden * 4)) val gates = i2h + h2h - val sliceGates = Symbol.SliceChannel(s"t${seqIdx}_l${layerIdx}_slice")(Array(gates), - Map("num_outputs" -> 4)) - val ingate = Symbol.Activation()(Map("data" -> sliceGates.get(0), "act_type" -> "sigmoid")) - val inTransform = Symbol.Activation()(Map("data" -> sliceGates.get(1), "act_type" -> "tanh")) - val forgetGate = Symbol.Activation()(Map("data" -> sliceGates.get(2), "act_type" -> "sigmoid")) - val outGate = Symbol.Activation()(Map("data" -> sliceGates.get(3), "act_type" -> "sigmoid")) + val sliceGates = Symbol.SliceChannel(s"t${seqIdx}_l${layerIdx}_slice")( + gates)(Map("num_outputs" -> 4)) + val ingate = Symbol.Activation()()(Map("data" -> sliceGates.get(0), "act_type" -> "sigmoid")) + val inTransform = Symbol.Activation()()(Map("data" -> sliceGates.get(1), "act_type" -> "tanh")) + val forgetGate = Symbol.Activation()()( + Map("data" -> sliceGates.get(2), "act_type" -> "sigmoid")) + val outGate = Symbol.Activation()()(Map("data" -> sliceGates.get(3), "act_type" -> "sigmoid")) val nextC = (forgetGate * prevState.c) + (ingate * inTransform) - val nextH = outGate * Symbol.Activation()(Map("data" -> nextC, "act_type" -> "tanh")) + val nextH = outGate * Symbol.Activation()()(Map("data" -> nextC, "act_type" -> "tanh")) LSTMState(c = nextC, h = nextH) } @@ -65,9 +66,9 @@ object Lstm { // embeding layer val data = Symbol.Variable("data") var label = Symbol.Variable("softmax_label") - val embed = 
Symbol.Embedding("embed")(Map("data" -> data, "input_dim" -> inputSize, + val embed = Symbol.Embedding("embed")()(Map("data" -> data, "input_dim" -> inputSize, "weight" -> embedWeight, "output_dim" -> numEmbed)) - val wordvec = Symbol.SliceChannel()(Array(embed), + val wordvec = Symbol.SliceChannel()(embed)( Map("num_outputs" -> seqLen, "squeeze_axis" -> true)) var hiddenAll = Array[Symbol]() @@ -86,15 +87,15 @@ object Lstm { lastStates(i) = nextState } // decoder - if (dropout > 0f) hidden = Symbol.Dropout()(Map("data" -> hidden, "p" -> dropout)) + if (dropout > 0f) hidden = Symbol.Dropout()()(Map("data" -> hidden, "p" -> dropout)) hiddenAll = hiddenAll :+ hidden } - val hiddenConcat = Symbol.Concat()(hiddenAll, Map("dim" -> 0)) - val pred = Symbol.FullyConnected("pred")(Map("data" -> hiddenConcat, "num_hidden" -> numLabel, - "weight" -> clsWeight, "bias" -> clsBias)) - label = Symbol.transpose(label) - label = Symbol.Reshape()(Map("data" -> label, "target_shape" -> "(0,)")) - val sm = Symbol.SoftmaxOutput("softmax")(Map("data" -> pred, "label" -> label)) + val hiddenConcat = Symbol.Concat()(hiddenAll: _*)(Map("dim" -> 0)) + val pred = Symbol.FullyConnected("pred")()(Map("data" -> hiddenConcat, "num_hidden" -> numLabel, + "weight" -> clsWeight, "bias" -> clsBias)) + label = Symbol.transpose()(label)() + label = Symbol.Reshape()()(Map("data" -> label, "target_shape" -> "(0,)")) + val sm = Symbol.SoftmaxOutput("softmax")()(Map("data" -> pred, "label" -> label)) sm } @@ -119,8 +120,8 @@ object Lstm { val data = Symbol.Variable("data") - var hidden = Symbol.Embedding("embed")(Map("data" -> data, "input_dim" -> inputSize, - "weight" -> embedWeight, "output_dim" -> numEmbed)) + var hidden = Symbol.Embedding("embed")()(Map("data" -> data, "input_dim" -> inputSize, + "weight" -> embedWeight, "output_dim" -> numEmbed)) var dpRatio = 0f // stack LSTM @@ -134,10 +135,10 @@ object Lstm { lastStates(i) = nextState } // decoder - if (dropout > 0f) hidden = 
Symbol.Dropout()(Map("data" -> hidden, "p" -> dropout)) - val fc = Symbol.FullyConnected("pred")(Map("data" -> hidden, "num_hidden" -> numLabel, + if (dropout > 0f) hidden = Symbol.Dropout()()(Map("data" -> hidden, "p" -> dropout)) + val fc = Symbol.FullyConnected("pred")()(Map("data" -> hidden, "num_hidden" -> numLabel, "weight" -> clsWeight, "bias" -> clsBias)) - val sm = Symbol.SoftmaxOutput("softmax")(Map("data" -> fc)) + val sm = Symbol.SoftmaxOutput("softmax")()(Map("data" -> fc)) var output = Array(sm) for (state <- lastStates) { output = output :+ state.c diff --git a/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/visualization/AlexNet.scala b/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/visualization/AlexNet.scala new file mode 100644 index 000000000000..a8d0f88da306 --- /dev/null +++ b/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/visualization/AlexNet.scala @@ -0,0 +1,55 @@ +package ml.dmlc.mxnet.examples.visualization + +import ml.dmlc.mxnet.Symbol + +/** + * @author Depeng Liang + */ +object AlexNet { + + def getSymbol(numClasses: Int = 1000): Symbol = { + val inputData = Symbol.Variable("data") + // stage 1 + val conv1 = Symbol.Convolution()()(Map( + "data" -> inputData, "kernel" -> "(11, 11)", "stride" -> "(4, 4)", "num_filter" -> 96)) + val relu1 = Symbol.Activation()()(Map("data" -> conv1, "act_type" -> "relu")) + val pool1 = Symbol.Pooling()()(Map( + "data" -> relu1, "pool_type" -> "max", "kernel" -> "(3, 3)", "stride" -> "(2,2)")) + val lrn1 = Symbol.LRN()()(Map("data" -> pool1, + "alpha" -> 0.0001f, "beta" -> 0.75f, "knorm" -> 1f, "nsize" -> 5)) + // stage 2 + val conv2 = Symbol.Convolution()()(Map( + "data" -> lrn1, "kernel" -> "(5, 5)", "pad" -> "(2, 2)", "num_filter" -> 256)) + val relu2 = Symbol.Activation()()(Map("data" -> conv2, "act_type" -> "relu")) + val pool2 = Symbol.Pooling()()(Map("data" -> relu2, + "kernel" -> "(3, 3)", "stride" -> "(2, 2)", "pool_type" -> "max")) + val lrn2 = 
Symbol.LRN()()(Map("data" -> pool2, + "alpha" -> 0.0001f, "beta" -> 0.75f, "knorm" -> 1f, "nsize" -> 5)) + // stage 3 + val conv3 = Symbol.Convolution()()(Map( + "data" -> lrn2, "kernel" -> "(3, 3)", "pad" -> "(1, 1)", "num_filter" -> 384)) + val relu3 = Symbol.Activation()()(Map("data" -> conv3, "act_type" -> "relu")) + val conv4 = Symbol.Convolution()()(Map( + "data" -> relu3, "kernel" -> "(3, 3)", "pad" -> "(1, 1)", "num_filter" -> 384)) + val relu4 = Symbol.Activation()()(Map("data" -> conv4, "act_type" -> "relu")) + val conv5 = Symbol.Convolution()()(Map( + "data" -> relu4, "kernel" -> "(3, 3)", "pad" -> "(1, 1)", "num_filter" -> 256)) + val relu5 = Symbol.Activation()()(Map("data" -> conv5, "act_type" -> "relu")) + val pool3 = Symbol.Pooling()()(Map("data" -> relu5, + "kernel" -> "(3, 3)", "stride" -> "(2, 2)", "pool_type" -> "max")) + // stage 4 + val flatten = Symbol.Flatten()()(Map("data" -> pool3)) + val fc1 = Symbol.FullyConnected()()(Map("data" -> flatten, "num_hidden" -> 4096)) + val relu6 = Symbol.Activation()()(Map("data" -> fc1, "act_type" -> "relu")) + val dropout1 = Symbol.Dropout()()(Map("data" -> relu6, "p" -> 0.5f)) + // stage 5 + val fc2 = Symbol.FullyConnected()()(Map("data" -> dropout1, "num_hidden" -> 4096)) + val relu7 = Symbol.Activation()()(Map("data" -> fc2, "act_type" -> "relu")) + val dropout2 = Symbol.Dropout()()(Map("data" -> relu7, "p" -> 0.5f)) + // stage 6 + val fc3 = Symbol.FullyConnected()()( + Map("data" -> dropout2, "num_hidden" -> numClasses)) + val softmax = Symbol.SoftmaxOutput("softmax")()(Map("data" -> fc3)) + softmax + } +} diff --git a/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/visualization/ExampleVis.scala b/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/visualization/ExampleVis.scala new file mode 100644 index 000000000000..f60d68a90354 --- /dev/null +++ b/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/visualization/ExampleVis.scala @@ -0,0 +1,73 @@ +package 
ml.dmlc.mxnet.examples.visualization + +import org.kohsuke.args4j.{CmdLineParser, Option} +import org.slf4j.LoggerFactory +import scala.collection.JavaConverters._ +import scala.util.parsing.json._ +import ml.dmlc.mxnet.Shape +import ml.dmlc.mxnet.Symbol +import ml.dmlc.mxnet.Visualization + +/** + * @author Depeng Liang + */ +object ExampleVis { + private val logger = LoggerFactory.getLogger(classOf[ExampleVis]) + + val netsList = List("LeNet", "AlexNet", "VGG", "GoogleNet", + "Inception_BN", "Inception_V3", "ResNet_Small") + + val netShapes = Map( + "LeNet" -> Shape(1, 1, 28, 28), + "AlexNet" -> Shape(1, 1, 224, 224), + "VGG" -> Shape(1, 1, 224, 224), + "GoogleNet" -> Shape(1, 1, 299, 299), + "Inception_BN" -> Shape(1, 1, 299, 299), + "Inception_V3" -> Shape(1, 1, 299, 299), + "ResNet_Small" -> Shape(1, 1, 28, 28) + ) + + def getNetSymbol(net: String): (Symbol, Shape) = { + assert(netsList.contains(net), s"Supported nets: ${netsList.mkString(", ")}") + net match { + case "LeNet" => (LeNet.getSymbol(), netShapes(net)) + case "AlexNet" => (AlexNet.getSymbol(), netShapes(net)) + case "VGG" => (VGG.getSymbol(), netShapes(net)) + case "GoogleNet" => (GoogleNet.getSymbol(), netShapes(net)) + case "Inception_BN" => (Inception_BN.getSymbol(), netShapes(net)) + case "Inception_V3" => (Inception_V3.getSymbol(), netShapes(net)) + case "ResNet_Small" => (ResNet_Small.getSymbol(), netShapes(net)) + } + } + + def main(args: Array[String]): Unit = { + val leis = new ExampleVis + val parser: CmdLineParser = new CmdLineParser(leis) + try { + parser.parseArgument(args.toList.asJava) + assert(leis.outDir != null) + + val (sym, shape) = getNetSymbol(leis.net) + + val dot = Visualization.plotNetwork(symbol = sym, + title = leis.net, shape = Map("data" -> shape), + nodeAttrs = Map("shape" -> "rect", "fixedsize" -> "false")) + + dot.render(engine = "dot", format = "pdf", fileName = leis.net, path = leis.outDir) + + } catch { + case ex: Exception => { + logger.error(ex.getMessage, ex) + 
parser.printUsage(System.err) + sys.exit(1) + } + } + } +} + +class ExampleVis { + @Option(name = "--out-dir", usage = "the output path") + private val outDir: String = null + @Option(name = "--net", usage = "network to visualize, e.g. LeNet, AlexNet, VGG ...") + private val net: String = "LeNet" +} diff --git a/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/visualization/GoogleNet.scala b/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/visualization/GoogleNet.scala new file mode 100644 index 000000000000..1ad30e2a27ac --- /dev/null +++ b/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/visualization/GoogleNet.scala @@ -0,0 +1,77 @@ +package ml.dmlc.mxnet.examples.visualization + +import ml.dmlc.mxnet.Symbol + +/** + * @author Depeng Liang + */ +object GoogleNet { + + def ConvFactory(data: Symbol, numFilter: Int, kernel: (Int, Int), stride: (Int, Int) = (1, 1), + pad: (Int, Int) = (0, 0), name: String = "", suffix: String = ""): Symbol = { + val conv = Symbol.Convolution(s"conv_${name}${suffix}")()( + Map("data" -> data, "num_filter" -> numFilter, "kernel" -> s"$kernel", + "stride" -> s"$stride", "pad" -> s"$pad")) + val act = Symbol.Activation(s"relu_${name}${suffix}")()( + Map("data" -> conv, "act_type" -> "relu")) + act + } + + def InceptionFactory(data: Symbol, num1x1: Int, num3x3red: Int, num3x3: Int, + numd5x5red: Int, numd5x5: Int, pool: String, proj: Int, name: String): Symbol = { + // 1x1 + val c1x1 = ConvFactory(data = data, numFilter = num1x1, + kernel = (1, 1), name = s"${name}_1x1") + // 3x3 reduce + 3x3 + val c3x3r = ConvFactory(data = data, numFilter = num3x3red, + kernel = (1, 1), name = s"${name}_3x3", suffix = "_reduce") + val c3x3 = ConvFactory(data = c3x3r, numFilter = num3x3, + kernel = (3, 3), pad = (1, 1), name = s"${name}_3x3") + // double 3x3 reduce + double 3x3 + val cd5x5r = ConvFactory(data = data, numFilter = numd5x5red, + kernel = (1, 1), name = s"${name}_5x5", suffix = "_reduce") + val cd5x5 = 
ConvFactory(data = cd5x5r, numFilter = numd5x5, + kernel = (5, 5), pad = (2, 2), name = s"${name}_5x5") + // pool + proj + val pooling = Symbol.Pooling(s"${pool}_pool_${name}_pool")()(Map("data" -> data, + "kernel" -> "(3, 3)", "stride" -> "(1, 1)", "pad" -> "(1, 1)", "pool_type" -> pool)) + val cproj = ConvFactory(data = pooling, numFilter = proj, + kernel = (1, 1), name = s"${name}_proj") + // concat + val concat = + Symbol.Concat(s"ch_concat_${name}_chconcat")(c1x1, c3x3, cd5x5, cproj)() + concat + } + + def getSymbol(numClasses: Int = 1000): Symbol = { + val data = Symbol.Variable("data") + val conv1 = ConvFactory(data, 64, kernel = (7, 7), + stride = (2, 2), pad = (3, 3), name = "conv1") + val pool1 = Symbol.Pooling()()(Map("data" -> conv1, "kernel" -> "(3, 3)", + "stride" -> "(2, 2)", "pool_type" -> "max")) + val conv2 = ConvFactory(pool1, 64, kernel = (1, 1), stride = (1, 1), name = "conv2") + val conv3 = ConvFactory(conv2, 192, kernel = (3, 3), + stride = (1, 1), pad = (1, 1), name = "conv3") + val pool3 = Symbol.Pooling()()(Map("data" -> conv3, + "kernel" -> "(3, 3)", "stride" -> "(2, 2)", "pool_type" -> "max")) + val in3a = InceptionFactory(pool3, 64, 96, 128, 16, 32, "max", 32, name = "in3a") + val in3b = InceptionFactory(in3a, 128, 128, 192, 32, 96, "max", 64, name = "in3b") + val pool4 = Symbol.Pooling()()(Map("data" -> in3b, "kernel" -> "(3, 3)", + "stride" -> "(2, 2)", "pool_type" -> "max")) + val in4a = InceptionFactory(pool4, 192, 96, 208, 16, 48, "max", 64, name = "in4a") + val in4b = InceptionFactory(in4a, 160, 112, 224, 24, 64, "max", 64, name = "in4b") + val in4c = InceptionFactory(in4b, 128, 128, 256, 24, 64, "max", 64, name = "in4c") + val in4d = InceptionFactory(in4c, 112, 144, 288, 32, 64, "max", 64, name = "in4d") + val in4e = InceptionFactory(in4d, 256, 160, 320, 32, 128, "max", 128, name = "in4e") + val pool5 = Symbol.Pooling()()(Map("data" -> in4e, "kernel" -> "(3, 3)", + "stride" -> "(2, 2)", "pool_type" -> "max")) + val in5a = 
InceptionFactory(pool5, 256, 160, 320, 32, 128, "max", 128, name = "in5a") + val in5b = InceptionFactory(in5a, 384, 192, 384, 48, 128, "max", 128, name = "in5b") + val pool6 = Symbol.Pooling()()(Map("data" -> in5b, "kernel" -> "(7, 7)", + "stride" -> "(1,1)", "pool_type" -> "avg")) + val flatten = Symbol.Flatten()()(Map("data" -> pool6)) + val fc1 = Symbol.FullyConnected()()(Map("data" -> flatten, "num_hidden" -> numClasses)) + val softmax = Symbol.SoftmaxOutput("softmax")()(Map("data" -> fc1)) + softmax + } +} diff --git a/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/visualization/Inception_BN.scala b/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/visualization/Inception_BN.scala new file mode 100644 index 000000000000..c0a0b5ecacff --- /dev/null +++ b/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/visualization/Inception_BN.scala @@ -0,0 +1,110 @@ +package ml.dmlc.mxnet.examples.visualization + +import ml.dmlc.mxnet.Symbol + +/** + * @author Depeng Liang + */ +object Inception_BN { + + def ConvFactory(data: Symbol, numFilter: Int, kernel: (Int, Int), stride: (Int, Int) = (1, 1), + pad: (Int, Int) = (0, 0), name: String = "", suffix: String = ""): Symbol = { + val conv = Symbol.Convolution(s"conv_${name}${suffix}")()( + Map("data" -> data, "num_filter" -> numFilter, "kernel" -> s"$kernel", + "stride" -> s"$stride", "pad" -> s"$pad")) + val bn = Symbol.BatchNorm(s"bn_${name}${suffix}")()(Map("data" -> conv)) + val act = Symbol.Activation(s"relu_${name}${suffix}")()( + Map("data" -> bn, "act_type" -> "relu")) + act + } + + def InceptionFactoryA(data: Symbol, num1x1: Int, num3x3red: Int, num3x3: Int, + numd3x3red: Int, numd3x3: Int, pool: String, proj: Int, name: String): Symbol = { + // 1x1 + val c1x1 = ConvFactory(data = data, numFilter = num1x1, + kernel = (1, 1), name = s"${name}_1x1") + // 3x3 reduce + 3x3 + val c3x3r = ConvFactory(data = data, numFilter = num3x3red, + kernel = (1, 1), name = s"${name}_3x3", suffix = 
"_reduce") + val c3x3 = ConvFactory(data = c3x3r, numFilter = num3x3, + kernel = (3, 3), pad = (1, 1), name = s"${name}_3x3") + // double 3x3 reduce + double 3x3 + val cd3x3r = ConvFactory(data = data, numFilter = numd3x3red, + kernel = (1, 1), name = s"${name}_double_3x3", suffix = "_reduce") + var cd3x3 = ConvFactory(data = cd3x3r, numFilter = numd3x3, + kernel = (3, 3), pad = (1, 1), name = s"${name}_double_3x3_0") + cd3x3 = ConvFactory(data = cd3x3, numFilter = numd3x3, + kernel = (3, 3), pad = (1, 1), name = s"${name}_double_3x3_1") + // pool + proj + val pooling = Symbol.Pooling(s"${pool}_pool_${name}_pool")()( + Map("data" -> data, "kernel" -> "(3, 3)", "stride" -> "(1, 1)", + "pad" -> "(1, 1)", "pool_type" -> pool)) + val cproj = ConvFactory(data = pooling, numFilter = proj, + kernel = (1, 1), name = s"${name}_proj") + // concat + val concat = Symbol.Concat(s"ch_concat_${name}_chconcat")(c1x1, c3x3, cd3x3, cproj)() + concat + } + + def InceptionFactoryB(data: Symbol, num3x3red : Int, num3x3 : Int, + numd3x3red : Int, numd3x3 : Int, name: String): Symbol = { + // 3x3 reduce + 3x3 + val c3x3r = ConvFactory(data = data, numFilter = num3x3red, + kernel = (1, 1), name = s"${name}_3x3", suffix = "_reduce") + val c3x3 = ConvFactory(data = c3x3r, numFilter = num3x3, + kernel = (3, 3), pad = (1, 1), stride = (2, 2), name = s"${name}_3x3") + // double 3x3 reduce + double 3x3 + val cd3x3r = ConvFactory(data = data, numFilter = numd3x3red, + kernel = (1, 1), name = s"${name}_double_3x3", suffix = "_reduce") + var cd3x3 = ConvFactory(data = cd3x3r, numFilter = numd3x3, + kernel = (3, 3), pad = (1, 1), stride = (1, 1), name = s"${name}_double_3x3_0") + cd3x3 = ConvFactory(data = cd3x3, numFilter = numd3x3, + kernel = (3, 3), pad = (1, 1), stride = (2, 2), name = s"${name}_double_3x3_1") + // pool + proj + val pooling = Symbol.Pooling(s"max_pool_${name}_pool")()( + Map("data" -> data, "kernel" -> "(3, 3)", "stride" -> "(2, 2)", + "pad" -> "(1, 1)", "pool_type" -> "max")) 
+ // concat + val concat = Symbol.Concat(s"ch_concat_${name}_chconcat")(c3x3, cd3x3, pooling)() + concat + } + + def getSymbol(numClasses: Int = 1000): Symbol = { + // data + val data = Symbol.Variable("data") + // stage 1 + val conv1 = ConvFactory(data = data, numFilter = 64, + kernel = (7, 7), stride = (2, 2), pad = (3, 3), name = "conv1") + val pool1 = Symbol.Pooling("pool1")()(Map("data" -> conv1, "kernel" -> "(3, 3)", + "stride" -> "(2, 2)", "pool_type" -> "max")) + // stage 2 + val conv2red = ConvFactory(data = pool1, numFilter = 64, + kernel = (1, 1), stride = (1, 1), name = "conv2red") + val conv2 = ConvFactory(data = conv2red, numFilter = 192, + kernel = (3, 3), stride = (1, 1), pad = (1, 1), name = "conv2") + val pool2 = Symbol.Pooling("pool2")()(Map("data" -> conv2, "kernel" -> "(3, 3)", + "stride" -> "(2, 2)", "pool_type" -> "max")) + // stage 2 + val in3a = InceptionFactoryA(pool2, 64, 64, 64, 64, 96, "avg", 32, "3a") + val in3b = InceptionFactoryA(in3a, 64, 64, 96, 64, 96, "avg", 64, "3b") + val in3c = InceptionFactoryB(in3b, 128, 160, 64, 96, "3c") + // stage 3 + val in4a = InceptionFactoryA(in3c, 224, 64, 96, 96, 128, "avg", 128, "4a") + val in4b = InceptionFactoryA(in4a, 192, 96, 128, 96, 128, "avg", 128, "4b") + val in4c = InceptionFactoryA(in4b, 160, 128, 160, 128, 160, "avg", 128, "4c") + val in4d = InceptionFactoryA(in4c, 96, 128, 192, 160, 192, "avg", 128, "4d") + val in4e = InceptionFactoryB(in4d, 128, 192, 192, 256, "4e") + // stage 4 + val in5a = InceptionFactoryA(in4e, 352, 192, 320, 160, 224, "avg", 128, "5a") + val in5b = InceptionFactoryA(in5a, 352, 192, 320, 192, 224, "max", 128, "5b") + // global avg pooling + val avg = Symbol.Pooling("global_pool")()(Map("data" -> in5b, "kernel" -> "(7, 7)", + "stride" -> "(1, 1)", "pool_type" -> "avg")) + // linear classifier + val flatten = Symbol.Flatten("flatten")()(Map("data" -> avg)) + val fc1 = Symbol.FullyConnected("fc1")()( + Map("data" -> flatten, "num_hidden" -> numClasses)) + val softmax 
= Symbol.SoftmaxOutput("softmax")()(Map("data" -> fc1)) + softmax + } +} diff --git a/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/visualization/Inception_V3.scala b/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/visualization/Inception_V3.scala new file mode 100644 index 000000000000..f00f39f5d670 --- /dev/null +++ b/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/visualization/Inception_V3.scala @@ -0,0 +1,236 @@ +package ml.dmlc.mxnet.examples.visualization + +import ml.dmlc.mxnet.Symbol + +/** + * @author Depeng Liang + */ +object Inception_V3 { + + def Conv(data: Symbol, numFilter: Int, kernel: (Int, Int) = (1, 1), stride: (Int, Int) = (1, 1), + pad: (Int, Int) = (0, 0), name: String = "", suffix: String = ""): Symbol = { + val conv = Symbol.Convolution(s"${name}${suffix}_conv2d")()( + Map("data" -> data, + "num_filter" -> numFilter, "kernel" -> s"$kernel", "stride" -> s"$stride" + , "pad" -> s"$pad", "no_bias" -> true)) + val bn = Symbol.BatchNorm(s"${name}${suffix}_batchnorm")()( + Map("data" -> conv, "fix_gamma" -> true)) + val act = Symbol.Activation(s"${name}${suffix}_relu")()( + Map("data" -> bn, "act_type" -> "relu")) + act + } + + def Inception7A( + data: Symbol, + num_1x1: Int, + num_3x3_red: Int, num_3x3_1: Int, num_3x3_2: Int, + num_5x5_red: Int, num_5x5: Int, + pool: String, proj: Int, + name: String): Symbol = { + val tower_1x1 = Conv(data, num_1x1, name = s"${name}_conv") + var tower_5x5 = Conv(data, num_5x5_red, + name = s"${name}_tower", suffix = "_conv") + tower_5x5 = Conv(tower_5x5, num_5x5, kernel = (5, 5), + pad = (2, 2), name = s"${name}_tower", suffix = "_conv_1") + var tower_3x3 = Conv(data, num_3x3_red, + name = s"${name}_tower_1", suffix = "_conv") + tower_3x3 = Conv(tower_3x3, num_3x3_1, kernel = (3, 3), + pad = (1, 1), name = s"${name}_tower_1", suffix = "_conv_1") + tower_3x3 = Conv(tower_3x3, num_3x3_2, kernel = (3, 3), + pad = (1, 1), name = s"${name}_tower_1", suffix = "_conv_2") + val 
pooling = Symbol.Pooling(s"${pool}_pool_${name}_pool")()( + Map("data" -> data, "kernel" -> "(3, 3)", + "stride" -> "(1, 1)", "pad" -> "(1, 1)", "pool_type" -> pool)) + val cproj = Conv(pooling, proj, name = s"${name}_tower_2", suffix = "_conv") + val concat = Symbol.Concat(s"ch_concat_${name}_chconcat")( + tower_1x1, tower_5x5, tower_3x3, cproj)() + concat + } + + // First Downsample + def Inception7B( + data: Symbol, + num_3x3: Int, + num_d3x3_red: Int, num_d3x3_1: Int, num_d3x3_2: Int, + pool: String, + name: String): Symbol = { + val tower_3x3 = Conv(data, num_3x3, kernel = (3, 3), pad = (0, 0), + stride = (2, 2), name = s"${name}_conv") + var tower_d3x3 = Conv(data, num_d3x3_red, + name = s"${name}_tower", suffix = "_conv") + tower_d3x3 = Conv(tower_d3x3, num_d3x3_1, kernel = (3, 3), + pad = (1, 1), stride = (1, 1), name = s"${name}_tower", suffix = "_conv_1") + tower_d3x3 = Conv(tower_d3x3, num_d3x3_2, kernel = (3, 3), + pad = (0, 0), stride = (2, 2), name = s"${name}_tower", suffix = "_conv_2") + val pooling = Symbol.Pooling(s"max_pool_${name}_pool")()(Map("data" -> data, + "kernel" -> "(3, 3)", "stride" -> "(2, 2)", "pad" -> "(0,0)", "pool_type" -> "max")) + val concat = Symbol.Concat(s"ch_concat_${name}_chconcat")( + tower_3x3, tower_d3x3, pooling)() + concat + } + + // scalastyle:off parameterNum + def Inception7C( + data: Symbol, + num_1x1: Int, + num_d7_red: Int, num_d7_1: Int, num_d7_2: Int, + num_q7_red: Int, num_q7_1: Int, num_q7_2: Int, num_q7_3: Int, num_q7_4: Int, + pool: String, proj: Int, + name: String): Symbol = { + val tower_1x1 = Conv(data = data, numFilter = num_1x1, + kernel = (1, 1), name = s"${name}_conv") + var tower_d7 = Conv(data = data, numFilter = num_d7_red, + name = s"${name}_tower", suffix = "_conv") + tower_d7 = Conv(data = tower_d7, numFilter = num_d7_1, kernel = (1, 7), + pad = (0, 3), name = s"${name}_tower", suffix = "_conv_1") + tower_d7 = Conv(data = tower_d7, numFilter = num_d7_2, kernel = (7, 1), + pad = (3, 0), name = 
s"${name}_tower", suffix = "_conv_2") + var tower_q7 = Conv(data = data, numFilter = num_q7_red, + name = s"${name}_tower_1", suffix = "_conv") + tower_q7 = Conv(data = tower_q7, numFilter = num_q7_1, kernel = (7, 1), + pad = (3, 0), name = s"${name}_tower_1", suffix = "_conv_1") + tower_q7 = Conv(data = tower_q7, numFilter = num_q7_2, kernel = (1, 7), + pad = (0, 3), name = s"${name}_tower_1", suffix = "_conv_2") + tower_q7 = Conv(data = tower_q7, numFilter = num_q7_3, kernel = (7, 1), + pad = (3, 0), name = s"${name}_tower_1", suffix = "_conv_3") + tower_q7 = Conv(data = tower_q7, numFilter = num_q7_4, kernel = (1, 7), + pad = (0, 3), name = s"${name}_tower_1", suffix = "_conv_4") + val pooling = Symbol.Pooling(s"${pool}_pool_${name}_pool")()( + Map("data" -> data, "kernel" -> "(3, 3)", + "stride" -> "(1, 1)", "pad" -> "(1, 1)", "pool_type" -> pool)) + val cproj = Conv(data = pooling, numFilter = proj, kernel = (1, 1), + name = s"${name}_tower_2", suffix = "_conv") + // concat + val concat = Symbol.Concat(s"ch_concat_${name}_chconcat")( + tower_1x1, tower_d7, tower_q7, cproj)() + concat + } + + def Inception7D( + data: Symbol, + num_3x3_red: Int, num_3x3: Int, + num_d7_3x3_red: Int, num_d7_1: Int, num_d7_2: Int, num_d7_3x3: Int, + pool: String, + name: String): Symbol = { + var tower_3x3 = Conv(data = data, numFilter = num_3x3_red, + name = s"${name}_tower", suffix = "_conv") + tower_3x3 = Conv(data = tower_3x3, numFilter = num_3x3, kernel = (3, 3), + pad = (0, 0), stride = (2, 2), name = s"${name}_tower", suffix = "_conv_1") + var tower_d7_3x3 = Conv(data = data, numFilter = num_d7_3x3_red, + name = s"${name}_tower_1", suffix = "_conv") + tower_d7_3x3 = Conv(data = tower_d7_3x3, numFilter = num_d7_1, + kernel = (1, 7), pad = (0, 3), name = s"${name}_tower_1", suffix = "_conv_1") + tower_d7_3x3 = Conv(data = tower_d7_3x3, numFilter = num_d7_2, + kernel = (7, 1), pad = (3, 0), name = s"${name}_tower_1", suffix = "_conv_2") + tower_d7_3x3 = Conv(data = 
tower_d7_3x3, numFilter = num_d7_3x3, + kernel = (3, 3), stride = (2, 2), name = s"${name}_tower_1", suffix = "_conv_3") + val pooling = Symbol.Pooling(s"${pool}_pool_${name}_pool")()( + Map("data" -> data, "kernel" -> "(3, 3)", "stride" -> "(2, 2)", "pool_type" -> pool)) + // concat + val concat = Symbol.Concat(s"ch_concat_${name}_chconcat")( + tower_3x3, tower_d7_3x3, pooling)() + concat + } + + def Inception7E( + data: Symbol, + num_1x1: Int, + num_d3_red: Int, num_d3_1: Int, num_d3_2: Int, + num_3x3_d3_red: Int, num_3x3: Int, num_3x3_d3_1: Int, num_3x3_d3_2: Int, + pool: String, proj: Int, + name: String): Symbol = { + val tower_1x1 = Conv(data = data, numFilter = num_1x1, + kernel = (1, 1), name = s"${name}_conv") + val tower_d3 = Conv(data = data, numFilter = num_d3_red, + name = s"${name}_tower", suffix = "_conv") + val tower_d3_a = Conv(data = tower_d3, numFilter = num_d3_1, kernel = (1, 3), + pad = (0, 1), name = s"${name}_tower", suffix = "_mixed_conv") + val tower_d3_b = Conv(data = tower_d3, numFilter = num_d3_2, kernel = (3, 1), + pad = (1, 0), name = s"${name}_tower", suffix = "_mixed_conv_1") + var tower_3x3_d3 = Conv(data = data, numFilter = num_3x3_d3_red, + name = s"${name}_tower_1", suffix = "_conv") + tower_3x3_d3 = Conv(data = tower_3x3_d3, numFilter = num_3x3, kernel = (3, 3), + pad = (1, 1), name = s"${name}_tower_1", suffix = "_conv_1") + val tower_3x3_d3_a = Conv(data = tower_3x3_d3, numFilter = num_3x3_d3_1, + kernel = (1, 3), pad = (0, 1), name = s"${name}_tower_1", suffix = "_mixed_conv") + val tower_3x3_d3_b = Conv(data = tower_3x3_d3, numFilter = num_3x3_d3_2, + kernel = (3, 1), pad = (1, 0), name = s"${name}_tower_1", suffix = "_mixed_conv_1") + val pooling = Symbol.Pooling(s"${pool}_pool_${name}_pool")()(Map("data" -> data, + "kernel" -> "(3, 3)", "stride" -> "(1, 1)", "pad" -> "(1, 1)", "pool_type" -> pool)) + val cproj = Conv(data = pooling, numFilter = proj, kernel = (1, 1), + name = s"${name}_tower_2", suffix = "_conv") + // 
concat + val concat = Symbol.Concat(s"ch_concat_${name}_chconcat")( + tower_1x1, tower_d3_a, tower_d3_b, tower_3x3_d3_a, tower_3x3_d3_b, cproj)() + concat + } + // scalastyle:on parameterNum + + def getSymbol(numClasses: Int = 1000): Symbol = { + val data = Symbol.Variable("data") + // stage 1 + val conv = Conv(data, 32, kernel = (3, 3), stride = (2, 2), name = "conv") + val conv_1 = Conv(conv, 32, kernel = (3, 3), name = "conv_1") + val conv_2 = Conv(conv_1, 64, kernel = (3, 3), pad = (1, 1), name = "conv_2") + var pool = Symbol.Pooling("pool")()(Map("data" -> conv_2, "kernel" -> "(3, 3)", + "stride" -> "(2, 2)", "pool_type" -> "max")) + // stage 2 + val conv_3 = Conv(pool, 80, kernel = (1, 1), name = "conv_3") + val conv_4 = Conv(conv_3, 192, kernel = (3, 3), name = "conv_4") + val pool1 = Symbol.Pooling("pool1")()(Map("data" -> conv_4, "kernel" -> "(3, 3)", + "stride" -> "(2, 2)", "pool_type" -> "max")) + // stage 3 + val in3a = Inception7A(pool1, 64, + 64, 96, 96, + 48, 64, + "avg", 32, "mixed") + val in3b = Inception7A(in3a, 64, + 64, 96, 96, + 48, 64, + "avg", 64, "mixed_1") + val in3c = Inception7A(in3b, 64, + 64, 96, 96, + 48, 64, + "avg", 64, "mixed_2") + val in3d = Inception7B(in3c, 384, + 64, 96, 96, + "max", "mixed_3") + // stage 4 + val in4a = Inception7C(in3d, 192, + 128, 128, 192, + 128, 128, 128, 128, 192, + "avg", 192, "mixed_4") + val in4b = Inception7C(in4a, 192, + 160, 160, 192, + 160, 160, 160, 160, 192, + "avg", 192, "mixed_5") + val in4c = Inception7C(in4b, 192, + 160, 160, 192, + 160, 160, 160, 160, 192, + "avg", 192, "mixed_6") + val in4d = Inception7C(in4c, 192, + 192, 192, 192, + 192, 192, 192, 192, 192, + "avg", 192, "mixed_7") + val in4e = Inception7D(in4d, 192, 320, + 192, 192, 192, 192, + "max", "mixed_8") + // stage 5 + val in5a = Inception7E(in4e, 320, + 384, 384, 384, + 448, 384, 384, 384, + "avg", 192, "mixed_9") + val in5b = Inception7E(in5a, 320, + 384, 384, 384, + 448, 384, 384, 384, + "max", 192, "mixed_10") + // pool + pool = 
Symbol.Pooling("global_pool")()(Map("data" -> in5b, + "kernel" -> "(8, 8)", "stride" -> "(1, 1)", "pool_type" -> "avg")) + val flatten = Symbol.Flatten("flatten")()(Map("data" -> pool)) + val fc1 = Symbol.FullyConnected("fc1")()( + Map("data" -> flatten, "num_hidden" -> numClasses)) + val softmax = Symbol.SoftmaxOutput("softmax")()(Map("data" -> fc1)) + softmax + } +} diff --git a/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/visualization/LeNet.scala b/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/visualization/LeNet.scala new file mode 100644 index 000000000000..c4c4fa197f3d --- /dev/null +++ b/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/visualization/LeNet.scala @@ -0,0 +1,35 @@ +package ml.dmlc.mxnet.examples.visualization + +import ml.dmlc.mxnet.Symbol + +/** + * @author Depeng Liang + */ +object LeNet { + + def getSymbol(numClasses: Int = 10): Symbol = { + val data = Symbol.Variable("data") + // first conv + val conv1 = Symbol.Convolution()()( + Map("data" -> data, "kernel" -> "(5, 5)", "num_filter" -> 20)) + val tanh1 = Symbol.Activation()()(Map("data" -> conv1, "act_type" -> "tanh")) + val pool1 = Symbol.Pooling()()(Map("data" -> tanh1, "pool_type" -> "max", + "kernel" -> "(2, 2)", "stride" -> "(2, 2)")) + // second conv + val conv2 = Symbol.Convolution()()( + Map("data" -> pool1, "kernel" -> "(5, 5)", "num_filter" -> 50)) + val tanh2 = Symbol.Activation()()(Map("data" -> conv2, "act_type" -> "tanh")) + val pool2 = Symbol.Pooling()()(Map("data" -> tanh2, "pool_type" -> "max", + "kernel" -> "(2, 2)", "stride" -> "(2, 2)")) + // first fullc + val flatten = Symbol.Flatten()()(Map("data" -> pool2)) + val fc1 = Symbol.FullyConnected()()(Map("data" -> flatten, "num_hidden" -> 500)) + val tanh3 = Symbol.Activation()()(Map("data" -> fc1, "act_type" -> "tanh")) + // second fullc + val fc2 = Symbol.FullyConnected()()( + Map("data" -> tanh3, "num_hidden" -> numClasses)) + // loss + val lenet = Symbol.SoftmaxOutput(name = 
"softmax")()(Map("data" -> fc2)) + lenet + } +} diff --git a/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/visualization/ResNet_Small.scala b/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/visualization/ResNet_Small.scala new file mode 100644 index 000000000000..cf31497841e1 --- /dev/null +++ b/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/visualization/ResNet_Small.scala @@ -0,0 +1,93 @@ +package ml.dmlc.mxnet.examples.visualization + +import ml.dmlc.mxnet.Symbol + +/** + * @author Depeng Liang + */ +object ResNet_Small { + + sealed trait ConvType + case object ConvWithoutAct extends ConvType + case object ConvWitAct extends ConvType + + def convFactory(data: Symbol, numFilter: Int, kernel: (Int, Int), + stride: (Int, Int), pad: (Int, Int), actType: String = "relu", + convType: ConvType = ConvWitAct): Symbol = convType match { + case ConvWitAct => { + val conv = Symbol.Convolution()()(Map("data" -> data, + "num_filter" -> numFilter, "kernel" -> s"$kernel", + "stride" -> s"$stride", "pad" -> s"$pad")) + val bn = Symbol.BatchNorm()()(Map("data" -> conv)) + val act = Symbol.Activation()()(Map("data" -> bn, "act_type" -> actType)) + act + } + case ConvWithoutAct => { + val conv = Symbol.Convolution()()(Map("data" -> data, + "num_filter" -> numFilter, "kernel" -> s"$kernel", + "stride" -> s"$stride", "pad" -> s"$pad")) + val bn = Symbol.BatchNorm()()(Map("data" -> conv)) + bn + } + } + + def residualFactory(data: Symbol, numFilter: Int, dimMatch: Boolean): Symbol = { + if (dimMatch == true) { + val identityData = data + val conv1 = convFactory(data = data, numFilter = numFilter, kernel = (3, 3), + stride = (1, 1), pad = (1, 1), actType = "relu", convType = ConvWitAct) + + val conv2 = convFactory(data = conv1, numFilter = numFilter, kernel = (3, 3), + stride = (1, 1), pad = (1, 1), convType = ConvWithoutAct) + val newData = identityData + conv2 + val act = Symbol.Activation()()(Map("data" -> newData, "act_type" -> "relu")) + 
act + } else { + val conv1 = convFactory(data = data, numFilter = numFilter, kernel = (3, 3), + stride = (2, 2), pad = (1, 1), actType = "relu", convType = ConvWitAct) + val conv2 = convFactory(data = conv1, numFilter = numFilter, kernel = (3, 3), + stride = (1, 1), pad = (1, 1), convType = ConvWithoutAct) + + // adopt project method in the paper when dimension increased + val projectData = convFactory(data = data, numFilter = numFilter, kernel = (1, 1), + stride = (2, 2), pad = (0, 0), convType = ConvWithoutAct) + val newData = projectData + conv2 + val act = Symbol.Activation()()(Map("data" -> newData, "act_type" -> "relu")) + act + } + } + + def residualNet(data: Symbol, n: Int): Symbol = { + // fisrt 2n layers + val data1 = (data /: (0 until n)) { (acc, elem) => + residualFactory(data = acc, numFilter = 16, dimMatch = true) + } + + // second 2n layers + val data2 = (data1 /: (0 until n)) { (acc, elem) => + if (elem == 0) residualFactory(data = acc, numFilter = 32, dimMatch = false) + else residualFactory(data = acc, numFilter = 32, dimMatch = true) + } + + // third 2n layers + val data3 = (data2 /: (0 until n)) { (acc, elem) => + if (elem == 0) residualFactory(data = acc, numFilter = 64, dimMatch = false) + else residualFactory(data = acc, numFilter = 64, dimMatch = true) + } + data3 + } + + def getSymbol(numClasses: Int = 1000): Symbol = { + val conv = convFactory(data = Symbol.Variable("data"), numFilter = 16, + kernel = (3, 3), stride = (1, 1), pad = (1, 1), actType = "relu", convType = ConvWitAct) + // set n = 3 means get a model with 3*6+2=20 layers, set n = 9 means 9*6+2=56 layers + val n = 3 + val resNet = residualNet(conv, n) + val pool = Symbol.Pooling()()(Map("data" -> resNet, + "kernel" -> "(7,7)", "pool_type" -> "avg")) + val flatten = Symbol.Flatten("flatten")()(Map("data" -> pool)) + val fc = Symbol.FullyConnected("fc1")()(Map("data" -> flatten, "num_hidden" -> numClasses)) + val softmax = Symbol.SoftmaxOutput("softmax")()(Map("data" -> fc)) + 
softmax + } +} diff --git a/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/visualization/VGG.scala b/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/visualization/VGG.scala new file mode 100644 index 000000000000..a8359e572805 --- /dev/null +++ b/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/visualization/VGG.scala @@ -0,0 +1,67 @@ +package ml.dmlc.mxnet.examples.visualization + +import ml.dmlc.mxnet.Symbol + +/** + * @author Depeng Liang + */ +object VGG { + + def getSymbol(numClasses: Int = 1000): Symbol = { + // define alexnet + val data = Symbol.Variable("data") + // group 1 + val conv1_1 = Symbol.Convolution("conv1_1")()( + Map("data" -> data, "num_filter" -> 64, "pad" -> "(1,1)", "kernel" -> "(3,3)")) + val relu1_1 = Symbol.Activation("relu1_1")()(Map("data" -> conv1_1, "act_type" -> "relu")) + val pool1 = Symbol.Pooling("pool1")()( + Map("data" -> relu1_1, "pool_type" -> "max", "kernel" -> "(2, 2)", "stride" -> "(2,2)")) + // group 2 + val conv2_1 = Symbol.Convolution("conv2_1")()( + Map("data" -> pool1, "num_filter" -> 128, "pad" -> "(1,1)", "kernel" -> "(3,3)")) + val relu2_1 = Symbol.Activation("relu2_1")()(Map("data" -> conv2_1, "act_type" -> "relu")) + val pool2 = Symbol.Pooling("pool2")()( + Map("data" -> relu2_1, "pool_type" -> "max", "kernel" -> "(2, 2)", "stride" -> "(2,2)")) + // group 3 + val conv3_1 = Symbol.Convolution("conv3_1")()( + Map("data" -> pool2, "num_filter" -> 256, "pad" -> "(1,1)", "kernel" -> "(3,3)")) + val relu3_1 = Symbol.Activation("relu3_1")()(Map("data" -> conv3_1, "act_type" -> "relu")) + val conv3_2 = Symbol.Convolution("conv3_2")()( + Map("data" -> relu3_1, "num_filter" -> 256, "pad" -> "(1,1)", "kernel" -> "(3,3)")) + val relu3_2 = Symbol.Activation("relu3_2")()(Map("data" -> conv3_2 , "act_type" -> "relu")) + val pool3 = Symbol.Pooling("pool3")()( + Map("data" -> relu3_2, "pool_type" -> "max", "kernel" -> "(2, 2)", "stride" -> "(2,2)")) + // group 4 + val conv4_1 = 
Symbol.Convolution("conv4_1")()( + Map("data" -> pool3, "num_filter" -> 512, "pad" -> "(1,1)", "kernel" -> "(3,3)")) + val relu4_1 = Symbol.Activation("relu4_1")()(Map("data" -> conv4_1 , "act_type" -> "relu")) + val conv4_2 = Symbol.Convolution("conv4_2")()( + Map("data" -> relu4_1, "num_filter" -> 512, "pad" -> "(1,1)", "kernel" -> "(3,3)")) + val relu4_2 = Symbol.Activation("relu4_2")()(Map("data" -> conv4_2 , "act_type" -> "relu")) + val pool4 = Symbol.Pooling("pool4")()( + Map("data" -> relu4_2, "pool_type" -> "max", "kernel" -> "(2, 2)", "stride" -> "(2,2)")) + // group 5 + val conv5_1 = Symbol.Convolution("conv5_1")()( + Map("data" -> pool4, "num_filter" -> 512, "pad" -> "(1,1)", "kernel" -> "(3,3)")) + val relu5_1 = Symbol.Activation("relu5_1")()(Map("data" -> conv5_1, "act_type" -> "relu")) + val conv5_2 = Symbol.Convolution("conv5_2")()( + Map("data" -> relu5_1, "num_filter" -> 512, "pad" -> "(1,1)", "kernel" -> "(3,3)")) + val relu5_2 = Symbol.Activation("relu5_2")()(Map("data" -> conv5_2, "act_type" -> "relu")) + val pool5 = Symbol.Pooling("pool5")()( + Map("data" -> relu5_2, "pool_type" -> "max", "kernel" -> "(2, 2)", "stride" -> "(2,2)")) + // group 6 + val flatten = Symbol.Flatten("flatten")()(Map("data" -> pool5)) + val fc6 = Symbol.FullyConnected("fc6")()(Map("data" -> flatten, "num_hidden" -> 4096)) + val relu6 = Symbol.Activation("relu6")()(Map("data" -> fc6, "act_type" -> "relu")) + val drop6 = Symbol.Dropout("drop6")()(Map("data" -> relu6, "p" -> 0.5f)) + // group 7 + val fc7 = Symbol.FullyConnected("fc7")()(Map("data" -> drop6, "num_hidden" -> 4096)) + val relu7 = Symbol.Activation("relu7")()(Map("data" -> fc7, "act_type" -> "relu")) + val drop7 = Symbol.Dropout("drop7")()(Map("data" -> relu7, "p" -> 0.5f)) + // output + val fc8 = Symbol.FullyConnected("fc8")()( + Map("data" -> drop7, "num_hidden" -> numClasses)) + val softmax = Symbol.SoftmaxOutput("softmax")()(Map("data" -> fc8)) + softmax + } +} diff --git 
a/scala-package/init-native/linux-x86_64/pom.xml b/scala-package/init-native/linux-x86_64/pom.xml new file mode 100644 index 000000000000..aa0960c528c0 --- /dev/null +++ b/scala-package/init-native/linux-x86_64/pom.xml @@ -0,0 +1,100 @@ + + + 4.0.0 + + ml.dmlc.mxnet + mxnet-scala-init-native-parent + 0.1.2-SNAPSHOT + ../pom.xml + + + ml.dmlc.mxnet + libmxnet-init-scala-linux-x86_64 + 0.1.2-SNAPSHOT + MXNet Scala Package - Initializer Native Linux-x86_64 + http://maven.apache.org + + so + + + + ml.dmlc.mxnet + mxnet-init_${scala.binary.version} + ${project.version} + jar + compile + + + + + + + org.apache.maven.plugins + maven-compiler-plugin + + + + org.codehaus.mojo + native-maven-plugin + true + + + linux + generic-classic + ${cxx} + ${cxx} + + + ../src/main/native + + ml_dmlc_mxnet_init_native_c_api.cc + + + + + -std=c++0x + + + -I../../../include + ${cflags} + + + -shared + + + -Wl,--whole-archive + ${lddeps} + ../../../lib/libmxnet.a + -Wl,--no-whole-archive + + + ${ldflags} + -fopenmp + + + + + + javah + generate-sources + + linux + default + ${project.build.directory}/custom-javah + ${basedir} + ml_dmlc_mxnet_init_native_c_api.h + + ml.dmlc.mxnet.init.LibInfo + + + + javah + + + + + + + diff --git a/scala-package/init-native/osx-x86_64/pom.xml b/scala-package/init-native/osx-x86_64/pom.xml new file mode 100644 index 000000000000..0d0c4843124a --- /dev/null +++ b/scala-package/init-native/osx-x86_64/pom.xml @@ -0,0 +1,100 @@ + + + 4.0.0 + + ml.dmlc.mxnet + mxnet-scala-init-native-parent + 0.1.2-SNAPSHOT + ../pom.xml + + + ml.dmlc.mxnet + libmxnet-init-scala-osx-x86_64 + 0.1.2-SNAPSHOT + MXNet Scala Package - Initializer Native OSX-x86_64 + http://maven.apache.org + + jnilib + + + + ml.dmlc.mxnet + mxnet-init_${scala.binary.version} + ${project.version} + jar + compile + + + + + + + org.apache.maven.plugins + maven-compiler-plugin + + + + org.codehaus.mojo + native-maven-plugin + true + + + darwin + generic-classic + ${cxx} + ${cxx} + + + ../src/main/native + 
+ ml_dmlc_mxnet_init_native_c_api.cc + + + + + -std=c++0x + + + -I../../../include + ${cflags} + + + -shared + + + -framework JavaVM + -Wl,-exported_symbol,_Java_* + -Wl,-x + ${lddeps} + -force_load ../../../lib/libmxnet.a + + + ${ldflags} + + + + + + javah + generate-sources + + darwin + default + ${project.build.directory}/custom-javah + ${basedir} + ml_dmlc_mxnet_init_native_c_api.h + + ml.dmlc.mxnet.init.LibInfo + + + + javah + + + + + + + diff --git a/scala-package/init-native/pom.xml b/scala-package/init-native/pom.xml new file mode 100644 index 000000000000..2e75c038ec8b --- /dev/null +++ b/scala-package/init-native/pom.xml @@ -0,0 +1,38 @@ + + + 4.0.0 + + ml.dmlc.mxnet + mxnet-parent_2.11 + 0.1.2-SNAPSHOT + ../pom.xml + + + mxnet-scala-init-native-parent + 0.1.2-SNAPSHOT + MXNet Scala Package - Initializer Native Parent + pom + + + + osx-x86_64-cpu + + osx-x86_64 + + + + linux-x86_64-cpu + + linux-x86_64 + + + + linux-x86_64-gpu + + linux-x86_64 + + + + diff --git a/scala-package/init-native/src/main/native/ml_dmlc_mxnet_init_native_c_api.cc b/scala-package/init-native/src/main/native/ml_dmlc_mxnet_init_native_c_api.cc new file mode 100644 index 000000000000..f0be4f81f4a8 --- /dev/null +++ b/scala-package/init-native/src/main/native/ml_dmlc_mxnet_init_native_c_api.cc @@ -0,0 +1,70 @@ +/*! 
+ * Copyright (c) 2015 by Contributors + * \file ml_dmlc_mxnet_native_c_api.cc + * \brief JNI function implementations + */ +#include "ml_dmlc_mxnet_init_native_c_api.h" // generated by javah +#include +#include + +JNIEXPORT jint JNICALL Java_ml_dmlc_mxnet_init_LibInfo_mxSymbolListAtomicSymbolCreators + (JNIEnv *env, jobject obj, jobject symbolList) { + mx_uint outSize; + AtomicSymbolCreator *outArray; + int ret = MXSymbolListAtomicSymbolCreators(&outSize, &outArray); + + jclass longCls = env->FindClass("java/lang/Long"); + jmethodID longConst = env->GetMethodID(longCls, "", "(J)V"); + + jclass listCls = env->FindClass("scala/collection/mutable/ListBuffer"); + jmethodID listAppend = env->GetMethodID(listCls, + "$plus$eq", "(Ljava/lang/Object;)Lscala/collection/mutable/ListBuffer;"); + + for (size_t i = 0; i < outSize; ++i) { + env->CallObjectMethod(symbolList, listAppend, + env->NewObject(longCls, longConst, outArray[i])); + } + + return ret; +} + +JNIEXPORT jint JNICALL Java_ml_dmlc_mxnet_init_LibInfo_mxSymbolGetAtomicSymbolInfo + (JNIEnv *env, jobject obj, jlong symbolPtr, jobject name, jobject desc, jobject numArgs, + jobject argNames, jobject argTypes, jobject argDescs, jobject keyVarNumArgs) { + + const char *cName; + const char *cDesc; + mx_uint cNumArgs; + const char **cArgNames; + const char **cArgTypes; + const char **cArgDescs; + const char *cKeyVarNumArgs; + + int ret = MXSymbolGetAtomicSymbolInfo(reinterpret_cast(symbolPtr), + &cName, &cDesc, &cNumArgs, + &cArgNames, &cArgTypes, &cArgDescs, + &cKeyVarNumArgs); + + jclass refIntClass = env->FindClass("ml/dmlc/mxnet/init/Base$RefInt"); + jfieldID valueInt = env->GetFieldID(refIntClass, "value", "I"); + + jclass refStringClass = env->FindClass("ml/dmlc/mxnet/init/Base$RefString"); + jfieldID valueStr = env->GetFieldID(refStringClass, "value", "Ljava/lang/String;"); + + // scala.collection.mutable.ListBuffer append method + jclass listClass = env->FindClass("scala/collection/mutable/ListBuffer"); + 
jmethodID listAppend = env->GetMethodID(listClass, "$plus$eq", + "(Ljava/lang/Object;)Lscala/collection/mutable/ListBuffer;"); + + env->SetObjectField(name, valueStr, env->NewStringUTF(cName)); + env->SetObjectField(desc, valueStr, env->NewStringUTF(cDesc)); + env->SetObjectField(keyVarNumArgs, valueStr, env->NewStringUTF(cKeyVarNumArgs)); + env->SetIntField(numArgs, valueInt, static_cast(cNumArgs)); + for (size_t i = 0; i < cNumArgs; ++i) { + env->CallObjectMethod(argNames, listAppend, env->NewStringUTF(cArgNames[i])); + env->CallObjectMethod(argTypes, listAppend, env->NewStringUTF(cArgTypes[i])); + env->CallObjectMethod(argDescs, listAppend, env->NewStringUTF(cArgDescs[i])); + } + + return ret; +} diff --git a/scala-package/init/pom.xml b/scala-package/init/pom.xml new file mode 100644 index 000000000000..ddaf3cfa6652 --- /dev/null +++ b/scala-package/init/pom.xml @@ -0,0 +1,92 @@ + + + 4.0.0 + + ml.dmlc.mxnet + mxnet-parent_2.11 + 0.1.2-SNAPSHOT + ../pom.xml + + + ml.dmlc.mxnet + mxnet-init_2.11 + 0.1.2-SNAPSHOT + MXNet Scala Package - Initializer + + + + osx-x86_64-cpu + + osx-x86_64-cpu + + + + linux-x86_64-cpu + + linux-x86_64-cpu + + + + linux-x86_64-gpu + + linux-x86_64-gpu + + + + + + + + org.apache.maven.plugins + maven-jar-plugin + + + META-INF/*.SF + META-INF/*.DSA + META-INF/*.RSA + + + + + org.apache.maven.plugins + maven-compiler-plugin + + + net.alchim31.maven + scala-maven-plugin + + + org.scalastyle + scalastyle-maven-plugin + + + + + + org.scala-lang + scala-library + + + commons-codec + commons-codec + + + log4j + log4j + + + org.slf4j + slf4j-api + + + org.slf4j + slf4j-log4j12 + + + org.scalacheck + scalacheck_${scala.binary.version} + + + diff --git a/scala-package/init/src/main/scala/ml/dmlc/mxnet/init/Base.scala b/scala-package/init/src/main/scala/ml/dmlc/mxnet/init/Base.scala new file mode 100644 index 000000000000..a7ec996c3f3c --- /dev/null +++ b/scala-package/init/src/main/scala/ml/dmlc/mxnet/init/Base.scala @@ -0,0 +1,35 @@ +package 
ml.dmlc.mxnet.init + +object Base { + tryLoadInitLibrary() + val _LIB = new LibInfo + + // type definitions + class RefInt(val value: Int = 0) + class RefLong(val value: Long = 0) + class RefFloat(val value: Float = 0) + class RefString(val value: String = null) + + type CPtrAddress = Long + + type NDArrayHandle = CPtrAddress + type FunctionHandle = CPtrAddress + type KVStoreHandle = CPtrAddress + type ExecutorHandle = CPtrAddress + type SymbolHandle = CPtrAddress + + @throws(classOf[UnsatisfiedLinkError]) + private def tryLoadInitLibrary(): Unit = { + val baseDir = System.getProperty("user.dir") + "/init-native" + val os = System.getProperty("os.name") + // ref: http://lopica.sourceforge.net/os.html + if (os.startsWith("Linux")) { + System.load(s"$baseDir/linux-x86_64/target/libmxnet-init-scala-linux-x86_64.so") + } else if (os.startsWith("Mac")) { + System.load(s"$baseDir/osx-x86_64/target/libmxnet-init-scala-osx-x86_64.jnilib") + } else { + // TODO(yizhi) support windows later + throw new UnsatisfiedLinkError() + } + } +} diff --git a/scala-package/init/src/main/scala/ml/dmlc/mxnet/init/LibInfo.scala b/scala-package/init/src/main/scala/ml/dmlc/mxnet/init/LibInfo.scala new file mode 100644 index 000000000000..aa249b3cdd2c --- /dev/null +++ b/scala-package/init/src/main/scala/ml/dmlc/mxnet/init/LibInfo.scala @@ -0,0 +1,17 @@ +package ml.dmlc.mxnet.init + +import ml.dmlc.mxnet.init.Base._ + +import scala.collection.mutable.ListBuffer + +class LibInfo { + @native def mxSymbolListAtomicSymbolCreators(symbolList: ListBuffer[SymbolHandle]): Int + @native def mxSymbolGetAtomicSymbolInfo(handle: SymbolHandle, + name: RefString, + desc: RefString, + numArgs: RefInt, + argNames: ListBuffer[String], + argTypes: ListBuffer[String], + argDescs: ListBuffer[String], + keyVarNumArgs: RefString): Int +} diff --git a/scala-package/macros/pom.xml b/scala-package/macros/pom.xml new file mode 100644 index 000000000000..70f0745e729a --- /dev/null +++ b/scala-package/macros/pom.xml @@ 
-0,0 +1,111 @@ + + + 4.0.0 + + ml.dmlc.mxnet + mxnet-parent_2.11 + 0.1.2-SNAPSHOT + ../pom.xml + + + mxnet-macros_2.11 + 0.1.2-SNAPSHOT + MXNet Scala Package - Macros + + + + osx-x86_64-cpu + + osx-x86_64 + jnilib + + + + linux-x86_64-cpu + + linux-x86_64 + so + + + + linux-x86_64-gpu + + linux-x86_64 + so + + + + + + + + org.apache.maven.plugins + maven-jar-plugin + + + META-INF/*.SF + META-INF/*.DSA + META-INF/*.RSA + + + + + org.apache.maven.plugins + maven-compiler-plugin + + + net.alchim31.maven + scala-maven-plugin + + + org.scalastyle + scalastyle-maven-plugin + + + + + + ml.dmlc.mxnet + mxnet-init_${scala.binary.version} + ${project.version} + provided + + + ml.dmlc.mxnet + libmxnet-init-scala-${platform} + ${project.version} + provided + ${libtype} + + + org.scala-lang + scala-library + + + org.scala-lang + scala-reflect + + + commons-codec + commons-codec + + + log4j + log4j + + + org.slf4j + slf4j-api + + + org.slf4j + slf4j-log4j12 + + + org.scalacheck + scalacheck_${scala.binary.version} + + + diff --git a/scala-package/macros/src/main/scala/ml/dmlc/mxnet/SymbolMacro.scala b/scala-package/macros/src/main/scala/ml/dmlc/mxnet/SymbolMacro.scala new file mode 100644 index 000000000000..e22f3db5b328 --- /dev/null +++ b/scala-package/macros/src/main/scala/ml/dmlc/mxnet/SymbolMacro.scala @@ -0,0 +1,145 @@ +package ml.dmlc.mxnet + +import scala.annotation.StaticAnnotation +import scala.collection.mutable.ListBuffer +import scala.language.experimental.macros +import scala.reflect.macros.blackbox + +import ml.dmlc.mxnet.init.Base._ + +private[mxnet] class AddSymbolFunctions extends StaticAnnotation { + private[mxnet] def macroTransform(annottees: Any*) = macro SymbolImplMacros.addDefs +} + +private[mxnet] object SymbolImplMacros { + case class SymbolFunction(handle: SymbolHandle, keyVarNumArgs: String) + + // scalastyle:off havetype + def addDefs(c: blackbox.Context)(annottees: c.Expr[Any]*) = { + impl(c)(false, annottees: _*) + } + // scalastyle:off havetype + 
+ private val symbolFunctions: Map[String, SymbolFunction] = initSymbolModule() + + private def impl(c: blackbox.Context)(addSuper: Boolean, annottees: c.Expr[Any]*): c.Expr[Any] = { + import c.universe._ + + val AST_TYPE_MAP_STRING_ANY = AppliedTypeTree(Ident(TypeName("Map")), + List(Ident(TypeName("String")), Ident(TypeName("Any")))) + val AST_TYPE_MAP_STRING_STRING = AppliedTypeTree(Ident(TypeName("Map")), + List(Ident(TypeName("String")), Ident(TypeName("String")))) + val AST_TYPE_SYMBOL_VARARG = AppliedTypeTree( + Select( + Select(Ident(termNames.ROOTPKG), TermName("scala")), + TypeName("") + ), + List(Select(Select(Select( + Ident(TermName("ml")), TermName("dmlc")), TermName("mxnet")), TypeName("Symbol"))) + ) + + val functionDefs = symbolFunctions map { case (funcName, funcProp) => + val functionScope = if (funcName.startsWith("_")) Modifiers(Flag.PRIVATE) else Modifiers() + // It will generate definition something like, + // def Concat(name: String = null, attr: Map[String, String] = null) + // (args: Symbol*)(kwargs: Map[String, Any] = null) + DefDef(functionScope, TermName(funcName), List(), + List( + List( + ValDef(Modifiers(Flag.PARAM | Flag.DEFAULTPARAM), TermName("name"), + Ident(TypeName("String")), Literal(Constant(null))), + ValDef(Modifiers(Flag.PARAM | Flag.DEFAULTPARAM), TermName("attr"), + AST_TYPE_MAP_STRING_STRING, Literal(Constant(null))) + ), + List( + ValDef(Modifiers(), TermName("args"), AST_TYPE_SYMBOL_VARARG, EmptyTree) + ), + List( + ValDef(Modifiers(Flag.PARAM | Flag.DEFAULTPARAM), TermName("kwargs"), + AST_TYPE_MAP_STRING_ANY, Literal(Constant(null))) + ) + ), TypeTree(), + Apply( + Ident(TermName("createSymbolGeneral")), + List( + Literal(Constant(funcName)), + Ident(TermName("name")), + Ident(TermName("attr")), + Ident(TermName("args")), + Ident(TermName("kwargs")) + ) + ) + ) + } + + val inputs = annottees.map(_.tree).toList + // pattern match on the inputs + val modDefs = inputs map { + case ClassDef(mods, name, something, 
template) => + val q = template match { + case Template(superMaybe, emptyValDef, defs) => + Template(superMaybe, emptyValDef, defs ++ functionDefs) + case ex => + throw new IllegalArgumentException(s"Invalid template: $ex") + } + ClassDef(mods, name, something, q) + case ModuleDef(mods, name, template) => + val q = template match { + case Template(superMaybe, emptyValDef, defs) => + Template(superMaybe, emptyValDef, defs ++ functionDefs) + case ex => + throw new IllegalArgumentException(s"Invalid template: $ex") + } + ModuleDef(mods, name, q) + case ex => + throw new IllegalArgumentException(s"Invalid macro input: $ex") + } + // wrap the result up in an Expr, and return it + val result = c.Expr(Block(modDefs, Literal(Constant()))) + result + } + + // List and add all the atomic symbol functions to current module. + private def initSymbolModule(): Map[String, SymbolFunction] = { + val symbolList = ListBuffer.empty[SymbolHandle] + _LIB.mxSymbolListAtomicSymbolCreators(symbolList) + symbolList.map(makeAtomicSymbolFunction).toMap + } + + // Create an atomic symbol function by handle and function name. + private def makeAtomicSymbolFunction(handle: SymbolHandle): (String, SymbolFunction) = { + val name = new RefString + val desc = new RefString + val keyVarNumArgs = new RefString + val numArgs = new RefInt + val argNames = ListBuffer.empty[String] + val argTypes = ListBuffer.empty[String] + val argDescs = ListBuffer.empty[String] + + _LIB.mxSymbolGetAtomicSymbolInfo( + handle, name, desc, numArgs, argNames, argTypes, argDescs, keyVarNumArgs) + val paramStr = ctypes2docstring(argNames, argTypes, argDescs) + val extraDoc: String = if (keyVarNumArgs.value != null && keyVarNumArgs.value.length > 0) { + s"This function support variable length of positional input (${keyVarNumArgs.value})." 
+ } else { + "" + } + val docStr = s"${name.value}\n${desc.value}\n\n$paramStr\n$extraDoc\n" + // scalastyle:off println + println("Atomic Symbol function defination:\n" + docStr) + // scalastyle:on println + (name.value, new SymbolFunction(handle, keyVarNumArgs.value)) + } + + // Convert ctypes returned doc string information into parameters docstring. + def ctypes2docstring(argNames: Seq[String], + argTypes: Seq[String], + argDescs: Seq[String]): String = { + val params = + (argNames zip argTypes zip argDescs) map { case ((argName, argType), argDesc) => + val desc = if (argDesc.isEmpty) "" else s"\n$argDesc" + s"$argName : $argType$desc" + } + s"Parameters\n----------\n${params.mkString("\n")}\n" + } +} diff --git a/scala-package/native/pom.xml b/scala-package/native/pom.xml index 8a6bd6b85f25..296aa561633e 100644 --- a/scala-package/native/pom.xml +++ b/scala-package/native/pom.xml @@ -5,7 +5,7 @@ 4.0.0 ml.dmlc.mxnet - mxnet-parent_2.10 + mxnet-parent_2.11 0.1.2-SNAPSHOT ../pom.xml diff --git a/scala-package/native/src/main/native/ml_dmlc_mxnet_native_c_api.cc b/scala-package/native/src/main/native/ml_dmlc_mxnet_native_c_api.cc index 394176487172..de29bbe880d9 100644 --- a/scala-package/native/src/main/native/ml_dmlc_mxnet_native_c_api.cc +++ b/scala-package/native/src/main/native/ml_dmlc_mxnet_native_c_api.cc @@ -596,6 +596,22 @@ JNIEXPORT jint JNICALL Java_ml_dmlc_mxnet_LibInfo_mxKVStoreGetRank return ret; } +JNIEXPORT jint JNICALL Java_ml_dmlc_mxnet_LibInfo_mxKVStoreGetNumDeadNode + (JNIEnv * env, jobject obj, jlong kvStorePtr, jint nodeId, jobject numberRef) { + int number; + int ret = MXKVStoreGetNumDeadNode(reinterpret_cast(kvStorePtr), + static_cast(nodeId), + &number); + SetIntField(env, numberRef, number); + return ret; +} + +JNIEXPORT jint JNICALL Java_ml_dmlc_mxnet_LibInfo_mxKVStoreSetBarrierBeforeExit + (JNIEnv * env, jobject obj, jlong kvStorePtr, jint doBarrier) { + return MXKVStoreSetBarrierBeforeExit(reinterpret_cast(kvStorePtr), + 
static_cast(doBarrier)); +} + JNIEXPORT jint JNICALL Java_ml_dmlc_mxnet_LibInfo_mxKVStoreFree (JNIEnv * env, jobject obj, jlong ptr) { return MXKVStoreFree(reinterpret_cast(ptr)); @@ -1447,3 +1463,156 @@ JNIEXPORT jint JNICALL Java_ml_dmlc_mxnet_LibInfo_mxNotifyShutdown (JNIEnv *env, jobject obj) { return MXNotifyShutdown(); } + +JNIEXPORT jint JNICALL Java_ml_dmlc_mxnet_LibInfo_mxRecordIOWriterCreate + (JNIEnv *env, jobject obj, jstring juri, jobject handle) { + RecordIOHandle out; + const char *uri = env->GetStringUTFChars(juri, 0); + int ret = MXRecordIOWriterCreate(uri, &out); + env->ReleaseStringUTFChars(juri, uri); + SetLongField(env, handle, reinterpret_cast(out)); + return ret; +} + +JNIEXPORT jint JNICALL Java_ml_dmlc_mxnet_LibInfo_mxRecordIOReaderCreate + (JNIEnv *env, jobject obj, jstring juri, jobject handle) { + RecordIOHandle out; + const char *uri = env->GetStringUTFChars(juri, 0); + int ret = MXRecordIOReaderCreate(uri, &out); + env->ReleaseStringUTFChars(juri, uri); + SetLongField(env, handle, reinterpret_cast(out)); + return ret; +} + +JNIEXPORT jint JNICALL Java_ml_dmlc_mxnet_LibInfo_mxRecordIOWriterFree + (JNIEnv *env, jobject obj, jlong handle) { + RecordIOHandle recordIOHandle = reinterpret_cast(handle); + int ret = MXRecordIOWriterFree(recordIOHandle); + return ret; +} + +JNIEXPORT jint JNICALL Java_ml_dmlc_mxnet_LibInfo_mxRecordIOReaderFree + (JNIEnv *env, jobject obj, jlong handle) { + RecordIOHandle recordIOHandle = reinterpret_cast(handle); + int ret = MXRecordIOReaderFree(&recordIOHandle); + return ret; +} + +JNIEXPORT jint JNICALL Java_ml_dmlc_mxnet_LibInfo_mxRecordIOWriterWriteRecord + (JNIEnv *env, jobject obj, jlong handle, jstring jbuf, jint size) { + const char *buf = env->GetStringUTFChars(jbuf, 0); + RecordIOHandle *recordIOHandle = reinterpret_cast(handle); + int ret = MXRecordIOWriterWriteRecord(recordIOHandle, buf, size); + env->ReleaseStringUTFChars(jbuf, buf); + return ret; +} + +JNIEXPORT jint JNICALL 
Java_ml_dmlc_mxnet_LibInfo_mxRecordIOReaderReadRecord + (JNIEnv *env, jobject obj, jlong handle, jobject buf) { + RecordIOHandle *recordIOHandle = reinterpret_cast(handle); + size_t size; + char const *out; + int ret = MXRecordIOReaderReadRecord(recordIOHandle, &out, &size); + SetStringField(env, buf, out); + return ret; +} + +JNIEXPORT jint JNICALL Java_ml_dmlc_mxnet_LibInfo_mxRecordIOWriterTell + (JNIEnv *env, jobject obj, jlong handle, jobject jpos) { + RecordIOHandle *recordIOHandle = reinterpret_cast(handle); + size_t pos; + int ret = MXRecordIOWriterTell(recordIOHandle, &pos); + SetIntField(env, jpos, pos); + return ret; +} + +JNIEXPORT jint JNICALL Java_ml_dmlc_mxnet_LibInfo_mxRecordIOReaderSeek + (JNIEnv *env, jobject obj, jlong handle, jint pos) { + RecordIOHandle *recordIOHandle = reinterpret_cast(handle); + int ret = MXRecordIOReaderSeek(recordIOHandle, pos); + return ret; +} + +JNIEXPORT jint JNICALL Java_ml_dmlc_mxnet_LibInfo_mxOptimizerFindCreator + (JNIEnv *env, jobject obj, jstring jkey, jobject out) { + OptimizerCreator creator; + const char *key = env->GetStringUTFChars(jkey, 0); + int ret = MXOptimizerFindCreator(key, &creator); + env->ReleaseStringUTFChars(jkey, key); + SetLongField(env, out, reinterpret_cast(creator)); + return ret; +} + +JNIEXPORT jint JNICALL Java_ml_dmlc_mxnet_LibInfo_mxOptimizerCreateOptimizer + (JNIEnv *env, jobject obj, jlong jcreator, jint num_param, + jobjectArray jkeys, jobjectArray jvals, jobject out) { + OptimizerHandle handle; + OptimizerCreator creator = reinterpret_cast(jcreator); + int len = env->GetArrayLength(jkeys); + const char **keys = NULL; + if (jkeys != NULL) { + keys = new const char *[len]; + for (size_t i = 0; i < len; i++) { + jstring jkey = reinterpret_cast(env->GetObjectArrayElement(jkeys, i)); + const char *key = env->GetStringUTFChars(jkey, 0); + keys[i] = key; + env->DeleteLocalRef(jkey); + } + } + const char **vals = NULL; + if (jvals != NULL) { + vals = new const char *[len]; + for (size_t i = 
0; i < len; i++) { + jstring jval = reinterpret_cast(env->GetObjectArrayElement(jvals, i)); + const char *val = env->GetStringUTFChars(jval, 0); + vals[i] = val; + env->DeleteLocalRef(jval); + } + } + int ret = MXOptimizerCreateOptimizer(creator, + num_param, + keys, + vals, + &handle); + SetLongField(env, out, reinterpret_cast(handle)); + // release allocated memory + if (jkeys != NULL) { + for (size_t i = 0; i < len; i++) { + jstring jkey = reinterpret_cast(env->GetObjectArrayElement(jkeys, i)); + env->ReleaseStringUTFChars(jkey, keys[i]); + env->DeleteLocalRef(jkey); + } + delete[] keys; + } + if (jvals != NULL) { + for (size_t i = 0; i < len; i++) { + jstring jval = reinterpret_cast(env->GetObjectArrayElement(jvals, i)); + env->ReleaseStringUTFChars(jval, vals[i]); + env->DeleteLocalRef(jval); + } + delete[] vals; + } + return ret; +} + +JNIEXPORT jint JNICALL Java_ml_dmlc_mxnet_LibInfo_mxOptimizerFree + (JNIEnv *env, jobject obj, jlong jhandle) { + OptimizerHandle handle = reinterpret_cast(jhandle); + int ret = MXOptimizerFree(handle); + return ret; +} + +JNIEXPORT jint JNICALL Java_ml_dmlc_mxnet_LibInfo_mxOptimizerUpdate + (JNIEnv *env, jobject obj, jlong jhandle, jint index, jlong jweight, + jlong jgrad, jfloat lr, jfloat wd) { + OptimizerHandle handle = reinterpret_cast(jhandle); + NDArrayHandle weight = reinterpret_cast(jweight); + NDArrayHandle grad = reinterpret_cast(jgrad); + int ret = MXOptimizerUpdate(handle, + index, + weight, + grad, + lr, + wd); + return ret; +} diff --git a/scala-package/pom.xml b/scala-package/pom.xml index 32d0777fadcf..51f94fafe964 100644 --- a/scala-package/pom.xml +++ b/scala-package/pom.xml @@ -4,7 +4,7 @@ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> 4.0.0 ml.dmlc.mxnet - mxnet-parent_2.10 + mxnet-parent_2.11 0.1.2-SNAPSHOT MXNet Scala Package - Parent https://github.com/dmlc/mxnet/tree/master/scala-package @@ -48,12 +48,15 @@ - 2.10.4 - 2.10 + 2.11.8 + 2.11 pom + init + 
init-native + macros core native examples @@ -303,6 +306,16 @@ net.alchim31.maven scala-maven-plugin 3.2.2 + + incremental + + + org.scalamacros + paradise_${scala.version} + 2.1.0 + + + compile @@ -420,6 +433,11 @@ 1.11.3 test + + org.scala-lang.modules + scala-parser-combinators_${scala.binary.version} + 1.0.4 + diff --git a/scala-package/scalastyle-config.xml b/scala-package/scalastyle-config.xml index 583a815a6fbe..61ef26b4076a 100644 --- a/scala-package/scalastyle-config.xml +++ b/scala-package/scalastyle-config.xml @@ -87,7 +87,7 @@ You can also disable only one rule, by specifying its rule id, as specified in: - + diff --git a/scala-package/spark/pom.xml b/scala-package/spark/pom.xml index a264615c83b9..6f60842b68d0 100644 --- a/scala-package/spark/pom.xml +++ b/scala-package/spark/pom.xml @@ -5,12 +5,12 @@ 4.0.0 ml.dmlc.mxnet - mxnet-parent_2.10 + mxnet-parent_2.11 0.1.2-SNAPSHOT ../pom.xml - mxnet-spark_2.10 + mxnet-spark_2.11 0.1.2-SNAPSHOT MXNet Scala Package - Spark ML pom diff --git a/scala-package/spark/src/main/scala/ml/dmlc/mxnet/spark/MXNet.scala b/scala-package/spark/src/main/scala/ml/dmlc/mxnet/spark/MXNet.scala index 0da7a72f4aa4..1346bf2ac579 100644 --- a/scala-package/spark/src/main/scala/ml/dmlc/mxnet/spark/MXNet.scala +++ b/scala-package/spark/src/main/scala/ml/dmlc/mxnet/spark/MXNet.scala @@ -3,7 +3,6 @@ package ml.dmlc.mxnet.spark import ml.dmlc.mxnet._ import ml.dmlc.mxnet.optimizer.SGD import ml.dmlc.mxnet.spark.io.LabeledPointIter -import org.apache.spark.{SparkFiles, SparkContext} import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD import org.slf4j.{Logger, LoggerFactory} @@ -61,6 +60,16 @@ class MXNet extends Serializable { this } + /** + * The application (including parameter scheduler & servers) + * will exist if it hasn't received heart beat for over timeout seconds + * @param timeout timeout in seconds (default 300) + */ + def setTimeout(timeout: Int): this.type = { + params.timeout = timeout + 
this + } + /** * These jars are required by the KVStores at runtime. * They will be uploaded and distributed to each node automatically @@ -99,7 +108,8 @@ class MXNet extends Serializable { logger.info("Starting scheduler on {}:{}", schedulerIP, schedulerPort) val scheduler = new ParameterServer(params.runtimeClasspath, role = "scheduler", rootUri = schedulerIP, rootPort = schedulerPort, - numServer = params.numServer, numWorker = params.numWorker, java = params.javabin) + numServer = params.numServer, numWorker = params.numWorker, + timeout = params.timeout, java = params.javabin) require(scheduler.startProcess(), "Failed to start ps scheduler process") sc.parallelize(1 to params.numServer, params.numServer).foreachPartition { p => @@ -109,6 +119,7 @@ class MXNet extends Serializable { rootUri = schedulerIP, rootPort = schedulerPort, numServer = params.numServer, numWorker = params.numWorker, + timeout = params.timeout, java = params.javabin) require(server.startProcess(), "Failed to start ps server process") } @@ -131,11 +142,14 @@ class MXNet extends Serializable { logger.info("Launching worker ...") logger.info("Batch {}", params.batchSize) + // give enough time for ps-lite to detect the dead nodes + Thread.sleep(20000) KVStoreServer.init(ParameterServer.buildEnv(role = "worker", rootUri = schedulerIP, rootPort = schedulerPort, numServer = params.numServer, numWorker = params.numWorker)) val kv = KVStore.create("dist_async") + kv.setBarrierBeforeExit(false) val optimizer: Optimizer = new SGD(learningRate = 0.01f, momentum = 0.9f, wd = 0.00001f) @@ -158,7 +172,7 @@ class MXNet extends Serializable { logger.info("Training finished, waiting for other workers ...") dataIter.dispose() - kv.barrier() + kv.setBarrierBeforeExit(true) kv.dispose() Iterator(new MXNetModel( model, params.dimension, params.batchSize, diff --git a/scala-package/spark/src/main/scala/ml/dmlc/mxnet/spark/MXNetParams.scala 
b/scala-package/spark/src/main/scala/ml/dmlc/mxnet/spark/MXNetParams.scala index dd5f6bbdd9a2..ecc1736e4971 100644 --- a/scala-package/spark/src/main/scala/ml/dmlc/mxnet/spark/MXNetParams.scala +++ b/scala-package/spark/src/main/scala/ml/dmlc/mxnet/spark/MXNetParams.scala @@ -39,6 +39,8 @@ private[mxnet] class MXNetParams extends Serializable { var dataName: String = "data" var labelName: String = "label" + var timeout: Int = 300 + // jars on executors for running mxnet application var jars: Array[String] = null def runtimeClasspath: String = { diff --git a/scala-package/spark/src/main/scala/ml/dmlc/mxnet/spark/ParameterServer.scala b/scala-package/spark/src/main/scala/ml/dmlc/mxnet/spark/ParameterServer.scala index 06cea29b100e..81ed5fdad14d 100644 --- a/scala-package/spark/src/main/scala/ml/dmlc/mxnet/spark/ParameterServer.scala +++ b/scala-package/spark/src/main/scala/ml/dmlc/mxnet/spark/ParameterServer.scala @@ -1,6 +1,6 @@ package ml.dmlc.mxnet.spark -import java.io.IOException +import java.io.{IOException, InputStream, OutputStream} import java.util.concurrent.atomic.AtomicReference import ml.dmlc.mxnet.KVStoreServer @@ -25,7 +25,7 @@ object ParameterServer { KVStoreServer.init(buildEnv( cmdLine.role, cmdLine.rootUri, cmdLine.rootPort, cmdLine.numServer, cmdLine.numWorker)) - KVStoreServer.start() + KVStoreServer.start(dieIfOthersGoOutTimeout = cmdLine.timeout) } catch { case e: Throwable => logger.error(e.getMessage, e) @@ -55,6 +55,8 @@ object ParameterServer { val numServer: Int = 1 @Option(name = "--num-worker", usage = "PS worker number") val numWorker: Int = 1 + @Option(name = "--timeout", usage = "PS go out timeout") + val timeout: Int = 0 def checkArguments(): Unit = { require(role != null, "Undefined role") @@ -72,19 +74,52 @@ class ParameterServer(private val classpath: String, private val rootPort: Int, private val numServer: Int = 1, private val numWorker: Int = 1, + private val timeout: Int = 0, private val java: String = "java", private val 
jvmOpts: String = "") { private val logger: Logger = LoggerFactory.getLogger(classOf[ParameterServer]) private val trackerProcess: AtomicReference[Process] = new AtomicReference[Process] + /** + * A utility class to redirect the child process's stdout or stderr. + */ + private class RedirectThread( + in: InputStream, + out: OutputStream, + name: String, + propagateEof: Boolean = false) + extends Thread(name) { + + setDaemon(true) + override def run() { + val buf = new Array[Byte](1024) + var len = in.read(buf) + while (len != -1) { + out.write(buf, 0, len) + out.flush() + len = in.read(buf) + } + if (propagateEof) { + out.close() + } + } + } + def startProcess(): Boolean = { val cp = if (classpath == null) "" else s"-cp $classpath" val cmd = s"$java $jvmOpts $cp $runningClass " + s"--role=$role --root-uri=$rootUri --root-port=$rootPort " + - s"--num-server=$numServer --num-worker=$numWorker" + s"--num-server=$numServer --num-worker=$numWorker --timeout=$timeout" logger.info(s"Start process: $cmd") try { - trackerProcess.set(Runtime.getRuntime.exec(cmd)) + val childProcess = Runtime.getRuntime.exec(cmd) + trackerProcess.set(childProcess) + val inputStream = childProcess.getInputStream + val errorStream = childProcess.getErrorStream + logger.info("Starting InputStream-Redirecter Thread") + new RedirectThread(inputStream, System.out, "InputStream-Redirecter", true).start() + logger.info("Starting ErrorStream-Redirecter Thread") + new RedirectThread(errorStream, System.err, "ErrorStream-Redirecter", true).start() true } catch { case ioe: IOException => diff --git a/scala-package/spark/src/main/scala/ml/dmlc/mxnet/spark/example/ClassificationExample.scala b/scala-package/spark/src/main/scala/ml/dmlc/mxnet/spark/example/ClassificationExample.scala index 4a30cad6fd2b..eee31a7fd498 100644 --- a/scala-package/spark/src/main/scala/ml/dmlc/mxnet/spark/example/ClassificationExample.scala +++ 
b/scala-package/spark/src/main/scala/ml/dmlc/mxnet/spark/example/ClassificationExample.scala @@ -134,12 +134,12 @@ object ClassificationExample { def getMlp: Symbol = { val data = Symbol.Variable("data") - val fc1 = Symbol.FullyConnected(name = "fc1")(Map("data" -> data, "num_hidden" -> 128)) - val act1 = Symbol.Activation(name = "relu1")(Map("data" -> fc1, "act_type" -> "relu")) - val fc2 = Symbol.FullyConnected(name = "fc2")(Map("data" -> act1, "num_hidden" -> 64)) - val act2 = Symbol.Activation(name = "relu2")(Map("data" -> fc2, "act_type" -> "relu")) - val fc3 = Symbol.FullyConnected(name = "fc3")(Map("data" -> act2, "num_hidden" -> 10)) - val mlp = Symbol.SoftmaxOutput(name = "softmax")(Map("data" -> fc3)) + val fc1 = Symbol.FullyConnected(name = "fc1")()(Map("data" -> data, "num_hidden" -> 128)) + val act1 = Symbol.Activation(name = "relu1")()(Map("data" -> fc1, "act_type" -> "relu")) + val fc2 = Symbol.FullyConnected(name = "fc2")()(Map("data" -> act1, "num_hidden" -> 64)) + val act2 = Symbol.Activation(name = "relu2")()(Map("data" -> fc2, "act_type" -> "relu")) + val fc3 = Symbol.FullyConnected(name = "fc3")()(Map("data" -> act2, "num_hidden" -> 10)) + val mlp = Symbol.SoftmaxOutput(name = "softmax")()(Map("data" -> fc3)) mlp } @@ -149,23 +149,25 @@ object ClassificationExample { def getLenet: Symbol = { val data = Symbol.Variable("data") // first conv - val conv1 = Symbol.Convolution()(Map("data" -> data, "kernel" -> "(5, 5)", "num_filter" -> 20)) - val tanh1 = Symbol.Activation()(Map("data" -> conv1, "act_type" -> "tanh")) - val pool1 = Symbol.Pooling()(Map("data" -> tanh1, "pool_type" -> "max", + val conv1 = Symbol.Convolution()()( + Map("data" -> data, "kernel" -> "(5, 5)", "num_filter" -> 20)) + val tanh1 = Symbol.Activation()()(Map("data" -> conv1, "act_type" -> "tanh")) + val pool1 = Symbol.Pooling()()(Map("data" -> tanh1, "pool_type" -> "max", "kernel" -> "(2, 2)", "stride" -> "(2, 2)")) // second conv - val conv2 = Symbol.Convolution()(Map("data" 
-> pool1, "kernel" -> "(5, 5)", "num_filter" -> 50)) - val tanh2 = Symbol.Activation()(Map("data" -> conv2, "act_type" -> "tanh")) - val pool2 = Symbol.Pooling()(Map("data" -> tanh2, "pool_type" -> "max", + val conv2 = Symbol.Convolution()()( + Map("data" -> pool1, "kernel" -> "(5, 5)", "num_filter" -> 50)) + val tanh2 = Symbol.Activation()()(Map("data" -> conv2, "act_type" -> "tanh")) + val pool2 = Symbol.Pooling()()(Map("data" -> tanh2, "pool_type" -> "max", "kernel" -> "(2, 2)", "stride" -> "(2, 2)")) // first fullc - val flatten = Symbol.Flatten()(Map("data" -> pool2)) - val fc1 = Symbol.FullyConnected()(Map("data" -> flatten, "num_hidden" -> 500)) - val tanh3 = Symbol.Activation()(Map("data" -> fc1, "act_type" -> "tanh")) + val flatten = Symbol.Flatten()()(Map("data" -> pool2)) + val fc1 = Symbol.FullyConnected()()(Map("data" -> flatten, "num_hidden" -> 500)) + val tanh3 = Symbol.Activation()()(Map("data" -> fc1, "act_type" -> "tanh")) // second fullc - val fc2 = Symbol.FullyConnected()(Map("data" -> tanh3, "num_hidden" -> 10)) + val fc2 = Symbol.FullyConnected()()(Map("data" -> tanh3, "num_hidden" -> 10)) // loss - val lenet = Symbol.SoftmaxOutput(name = "softmax")(Map("data" -> fc2)) + val lenet = Symbol.SoftmaxOutput(name = "softmax")()(Map("data" -> fc2)) lenet } } diff --git a/scala-package/spark/src/main/scala/ml/dmlc/mxnet/spark/utils/Img2Vector.scala b/scala-package/spark/src/main/scala/ml/dmlc/mxnet/spark/utils/Img2Vector.scala new file mode 100644 index 000000000000..4e48c9ec2b06 --- /dev/null +++ b/scala-package/spark/src/main/scala/ml/dmlc/mxnet/spark/utils/Img2Vector.scala @@ -0,0 +1,68 @@ +package ml.dmlc.mxnet.spark.utils + +import javax.imageio.ImageIO + +import scala.collection.mutable.ArrayBuffer + +import org.apache.spark.SparkContext +import org.apache.spark.rdd.RDD +import org.apache.spark.SparkConf +import org.apache.spark.input._ +import org.apache.spark.mllib.linalg.{Vector, Vectors} + +/** + * Img2Vector tools could convert imgae 
directory into Vectorized RDD,for example: + * Images stored in hdfs://namenode:9000/user/xxx/images/ + * val sc = new SparkContext(conf) + * val imagesArrayRDD = Img2Vector.getRGBArray(sc, "hdfs://namenode:9000/user/xxx/images/") + * val imagesVectorRDD = Img2Vector.getRGBVector(sc, "hdfs://namenode:9000/user/xxx/images/") + * @author Yuance.Li + */ +object Img2Vector{ + def getImgRGB(PDS: PortableDataStream, fullcolor: Boolean): Array[Double] = { + val img = ImageIO.read(PDS.open()) + val R = ArrayBuffer[Double]() + val G = ArrayBuffer[Double]() + val B = ArrayBuffer[Double]() + val RGB = ArrayBuffer[Double]() + val w = img.getWidth + val h = img.getHeight + if (fullcolor) { + for (x <- 0 until w){ + for (y <- 0 until h) { + val color = img.getRGB(w - x - 1, y) & 0xffffff + R += (color & 0xff0000) / 65536 + G += (color & 0xff00) / 256 + B += (color & 0xff) + } + } + RGB ++= R ++= G ++= B + RGB.toArray + } else { + for (x <- 0 until w) { + for (y <- 0 until h){ + val color = img.getRGB(w - x - 1, y) & 0xffffff + R += (color & 0xff0000) / 65536 * 0.3 + G += (color & 0xff00) / 256 * 0.59 + B += (color & 0xff) * 0.11 + } + } + val grayArr = new Array[Double](w * h) + for (i <- 0 until w * h) { + grayArr(i) = R(i) + G(i) + B(i) + } + grayArr + } + } + + def getRGBArray(sc: SparkContext, path: String, fullcolor: Boolean = true): RDD[Array[Double]] = { + val rgbArray = sc.binaryFiles(path).map(_._2).map(getImgRGB(_, fullcolor)) + rgbArray + } + + def getRGBVector(sc: SparkContext, path: String, fullcolor: Boolean = true): RDD[Vector] = { + val rgbArray = sc.binaryFiles(path).map(_._2).map(getImgRGB(_, fullcolor)) + val rgbVector = rgbArray.map(x => Vectors.dense(x)) + rgbVector + } +} diff --git a/scala-package/spark/src/main/scala/ml/dmlc/mxnet/spark/utils/RepIterator.scala b/scala-package/spark/src/main/scala/ml/dmlc/mxnet/spark/utils/RepIterator.scala new file mode 100644 index 000000000000..a9aa21582893 --- /dev/null +++ 
b/scala-package/spark/src/main/scala/ml/dmlc/mxnet/spark/utils/RepIterator.scala @@ -0,0 +1,32 @@ +package ml.dmlc.mxnet.spark.utils + +import scala.collection.Iterator + +/** + * Repeatable Iterator useful in mapPartitions + * @author Yuance.Li + */ +class RepIterator[T](iteratorInternal: Iterator[T], repetition: Int = 1) extends Iterator[T] { + assert(repetition > 0) + var counter = repetition - 1 + var (currentIter, backupIter) = iteratorInternal.duplicate + + override def hasNext: Boolean = { + currentIter.hasNext || counter > 0 + } + + override def next(): T = { + assert(hasNext) + if(currentIter.hasNext) { + currentIter.next() + } else if (counter > 0) { + counter = counter - 1 + var iterTuple = backupIter.duplicate + currentIter = iterTuple._1 + backupIter = iterTuple._2 + currentIter.next() + } else { + throw new NoSuchElementException("No element in this collection") + } + } +} diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index 15b2083a1c48..bb48f5a747f7 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -1292,6 +1292,13 @@ int MXKVStoreBarrier(KVStoreHandle handle) { API_END(); } +int MXKVStoreSetBarrierBeforeExit(KVStoreHandle handle, + const int barrier_before_exit) { + API_BEGIN(); + static_cast(handle)->set_barrier_before_exit(barrier_before_exit); + API_END(); +} + int MXInitPSEnv(mx_uint num_vars, const char **keys, const char **vals) { @@ -1351,6 +1358,15 @@ int MXKVStoreGetType(KVStoreHandle handle, API_END(); } +int MXKVStoreGetNumDeadNode(KVStoreHandle handle, + const int node_id, + int *number, + const int timeout_sec) { + API_BEGIN(); + *number = static_cast(handle)->get_num_dead_node(node_id, timeout_sec); + API_END(); +} + struct MXRecordIOContext { dmlc::RecordIOWriter *writer; dmlc::RecordIOReader *reader; @@ -1389,6 +1405,14 @@ int MXRecordIOWriterWriteRecord(RecordIOHandle *handle, API_END(); } +int MXRecordIOWriterTell(RecordIOHandle *handle, size_t *pos) { + API_BEGIN(); + MXRecordIOContext *context = + 
reinterpret_cast(handle); + *pos = context->writer->Tell(); + API_END(); +} + int MXRecordIOReaderCreate(const char *uri, RecordIOHandle *out) { API_BEGIN(); @@ -1427,6 +1451,14 @@ int MXRecordIOReaderReadRecord(RecordIOHandle *handle, API_END(); } +int MXRecordIOReaderSeek(RecordIOHandle *handle, size_t pos) { + API_BEGIN(); + MXRecordIOContext *context = + reinterpret_cast(handle); + context->reader->Seek(pos); + API_END(); +} + int MXRtcCreate(char* name, mx_uint num_input, mx_uint num_output, char** input_names, char** output_names, NDArrayHandle* inputs, NDArrayHandle* outputs, diff --git a/src/io/image_recordio.h b/src/io/image_recordio.h index 3b4fa0302435..10674ec20355 100644 --- a/src/io/image_recordio.h +++ b/src/io/image_recordio.h @@ -40,13 +40,17 @@ struct ImageRecordIO { }; /*! \brief header of image recordio */ Header header; + /*! \brief point to label */ + float *label; + /*! \brief number of float labels */ + int num_label; /*! \brief pointer to data content */ uint8_t *content; /*! \brief size of the content */ size_t content_size; /*! \brief constructor */ ImageRecordIO(void) - : content(NULL), content_size(0) { + : label(NULL), num_label(0), content(NULL), content_size(0) { memset(&header, 0, sizeof(header)); } /*! \brief get image id from record */ @@ -63,6 +67,16 @@ struct ImageRecordIO { std::memcpy(&header, buf, sizeof(header)); content = reinterpret_cast(buf) + sizeof(header); content_size = size - sizeof(header); + if (header.flag > 0) { + CHECK(content_size >= sizeof(float)*header.flag); + label = reinterpret_cast(content); + num_label = header.flag; + content = reinterpret_cast(label + header.flag); + content_size -= sizeof(float)*header.flag; + } else { + label = NULL; + num_label = 0; + } } /*! 
* \brief save the record header diff --git a/src/io/iter_image_recordio.cc b/src/io/iter_image_recordio.cc index 42ae5c89757c..8346a5a9a1df 100644 --- a/src/io/iter_image_recordio.cc +++ b/src/io/iter_image_recordio.cc @@ -201,8 +201,6 @@ inline void ImageRecordIOParser::Init( if (param_.path_imglist.length() != 0) { label_map_.reset(new ImageLabelMap(param_.path_imglist.c_str(), param_.label_width, !param_.verbose)); - } else { - param_.label_width = 1; } CHECK(param_.path_imgrec.length() != 0) << "ImageRecordIOIterator: must specify image_rec"; @@ -276,7 +274,16 @@ ParseNext(std::vector *out_vec) { mshadow::Tensor label = out.label().Back(); if (label_map_ != nullptr) { mshadow::Copy(label, label_map_->Find(rec.image_index())); + } else if (rec.label != NULL) { + CHECK_EQ(param_.label_width, rec.num_label) + << "rec file provide " << rec.num_label << "-dimensional label " + "but label_width is set to " << param_.label_width; + mshadow::Copy(label, mshadow::Tensor(rec.label, + mshadow::Shape1(rec.num_label))); } else { + CHECK_EQ(param_.label_width, 1) + << "label_width must be 1 unless an imglist is provided " + "or the rec file is packed with multi dimensional label"; label[0] = rec.header.label; } res.release(); diff --git a/src/io/iter_prefetcher.h b/src/io/iter_prefetcher.h index 0765827df13a..665a5504ae73 100644 --- a/src/io/iter_prefetcher.h +++ b/src/io/iter_prefetcher.h @@ -25,10 +25,19 @@ namespace io { struct PrefetcherParam : public dmlc::Parameter { /*! \brief number of prefetched batches */ size_t prefetch_buffer; + /*! 
\brief data type */ + int dtype; + // declare parameters DMLC_DECLARE_PARAMETER(PrefetcherParam) { DMLC_DECLARE_FIELD(prefetch_buffer).set_default(4) .describe("Backend Param: Number of prefetched parameters"); + DMLC_DECLARE_FIELD(dtype) + .add_enum("float32", mshadow::kFloat32) + .add_enum("float64", mshadow::kFloat64) + .add_enum("float16", mshadow::kFloat16) + .set_default(mshadow::default_type_flag) + .describe("Data type."); } }; @@ -36,7 +45,7 @@ struct PrefetcherParam : public dmlc::Parameter { class PrefetcherIter : public IIterator { public: explicit PrefetcherIter(IIterator* base) - : out_(nullptr), loader_(base) { + : loader_(base), out_(nullptr) { } ~PrefetcherIter() { @@ -70,7 +79,9 @@ class PrefetcherIter : public IIterator { (*dptr)->data.resize(batch.data.size()); (*dptr)->index.resize(batch.batch_size); for (size_t i = 0; i < batch.data.size(); ++i) { - (*dptr)->data.at(i) = NDArray(batch.data[i].shape_, Context::CPU()); + (*dptr)->data.at(i) = NDArray(batch.data[i].shape_, + Context::CPU(), false, + param_.dtype); } } CHECK(batch.data.size() == (*dptr)->data.size()); @@ -102,7 +113,7 @@ class PrefetcherIter : public IIterator { // do recycle if (recycle_queue_.size() == param_.prefetch_buffer) { DataBatch *old_batch = recycle_queue_.front(); - // can be more efficienct on engine + // can be more efficient on engine for (NDArray& arr : old_batch->data) { arr.WaitToWrite(); } @@ -115,17 +126,19 @@ class PrefetcherIter : public IIterator { return *out_; } - private: + protected: /*! \brief prefetcher parameters */ PrefetcherParam param_; - // output data + /*! \brief internal batch loader */ + std::unique_ptr > loader_; + + private: + /*! \brief output data */ DataBatch *out_; - // queue to be recycled + /*! \brief queue to be recycled */ std::queue recycle_queue_; - // backend thread + /*! 
\brief backend thread */ dmlc::ThreadedIter iter_; - // internal batch loader - std::unique_ptr > loader_; }; } // namespace io } // namespace mxnet diff --git a/src/kvstore/comm.h b/src/kvstore/comm.h new file mode 100644 index 000000000000..eda367080284 --- /dev/null +++ b/src/kvstore/comm.h @@ -0,0 +1,350 @@ +/** + * Copyright (c) 2015 by Contributors + */ +#ifndef MXNET_KVSTORE_COMM_H_ +#define MXNET_KVSTORE_COMM_H_ +#include +#include +#include +#include +#include +#include "mxnet/ndarray.h" +namespace mxnet { +namespace kvstore { +/** + * \brief multiple device commmunication + */ +class Comm { + public: + Comm() { + pinned_ctx_ = (MXNET_USE_CUDA != 0) ? Context::CPUPinned(0) : Context::CPU(); + } + virtual ~Comm() { } + /** + * \brief init key with the data shape + */ + virtual void Init(int key, const TShape &shape) = 0; + /** + * \brief returns src[0] + .. + src[src.size()-1] + */ + virtual const NDArray& Reduce( + int key, const std::vector& src, int priority) = 0; + /** + * \brief copy from src to dst[i] for every i + */ + virtual void Broadcast( + int key, const NDArray& src, + const std::vector dst, int priority) = 0; + + protected: + Context pinned_ctx_; +}; + +/** + * \brief an implemention of Comm that first copy data to CPU memeory, and then + * reduce there + */ +class CommCPU : public Comm { + public: + CommCPU() { + nthread_reduction_ = dmlc::GetEnv("MXNET_KVSTORE_REDUCTION_NTHREADS", 4); + bigarray_bound_ = dmlc::GetEnv("MXNET_KVSTORE_BIGARRAY_BOUND", 1000 * 1000); + } + virtual ~CommCPU() { } + + void Init(int key, const TShape &shape) override { + merge_buf_[key].merged = NDArray(shape, pinned_ctx_); + } + + const NDArray& Reduce(int key, const std::vector& src, + int priority) override { + // avoid extra copy for single device, but it may bring problems for + // abnormal usage of kvstore + if (src.size() == 1) { + return src[0]; + } + std::vector const_vars(src.size() - 1); + std::vector reduce(src.size()); + auto& buf = merge_buf_[key]; + 
CopyFromTo(src[0], &buf.merged, priority); + reduce[0] = buf.merged; + + if (buf.copy_buf.empty()) { + buf.copy_buf.resize(src.size()-1); + for (size_t j = 0; j < src.size() - 1; ++j) { + buf.copy_buf[j] = NDArray(src[0].shape(), pinned_ctx_); + } + } + for (size_t i = 1; i < src.size(); ++i) { + CopyFromTo(src[i], &(buf.copy_buf[i-1]), priority); + reduce[i] = buf.copy_buf[i-1]; + const_vars[i-1] = reduce[i].var(); + } + + Engine::Get()->PushSync([reduce, this](RunContext rctx) { + ReduceSumCPU(reduce); + }, Context::CPU(), const_vars, {reduce[0].var()}, + FnProperty::kCPUPrioritized, priority); + + return buf.merged; + } + + void Broadcast(int key, const NDArray& src, + const std::vector dst, int priority) override { + int mask = src.ctx().dev_mask(); + if (mask == Context::kCPU) { + for (auto d : dst) CopyFromTo(src, d, priority); + } else { + // first copy data to cpu, then broadcast + auto& buf = merge_buf_[key]; + CopyFromTo(src, &buf.merged, priority); + for (auto d : dst) CopyFromTo(buf.merged, d, priority); + } + } + + private: + inline static void ReduceSumCPU( + const std::vector &dptr, size_t offset, index_t size) { + using namespace mshadow; // NOLINT(*) + Tensor in_0(dptr[0] + offset, Shape1(size)); + for (size_t i = 1; i < dptr.size(); i+=4) { + switch (dptr.size() - i) { + case 1: { + Tensor in_1(dptr[i] + offset, Shape1(size)); + in_0 += in_1; + break; + } + case 2: { + Tensor in_1(dptr[i] + offset, Shape1(size)); + Tensor in_2(dptr[i+1] + offset, Shape1(size)); + in_0 += in_1 + in_2; + break; + } + case 3: { + Tensor in_1(dptr[i] + offset, Shape1(size)); + Tensor in_2(dptr[i+1] + offset, Shape1(size)); + Tensor in_3(dptr[i+2] + offset, Shape1(size)); + in_0 += in_1 + in_2 + in_3; + break; + } + default: { + Tensor in_1(dptr[i] + offset, Shape1(size)); + Tensor in_2(dptr[i+1] + offset, Shape1(size)); + Tensor in_3(dptr[i+2] + offset, Shape1(size)); + Tensor in_4(dptr[i+3] + offset, Shape1(size)); + in_0 += in_1 + in_2 + in_3 + in_4; + break; + } + 
} + } + } + // reduce sum into val[0] + inline void ReduceSumCPU(const std::vector &in_data) { + const size_t step = std::min(bigarray_bound_, static_cast(4 << 10)); + // ge ptr out + std::vector dptr(in_data.size()); + for (size_t i = 0; i < in_data.size(); ++i) { + TBlob data = in_data[i].data(); + CHECK(data.CheckContiguous()); + dptr[i] = data.FlatTo2D().dptr_; + } + size_t total = in_data[0].shape().Size(); + long ntask = (total + step - 1) / step; // NOLINT(*) + if (total < bigarray_bound_ || nthread_reduction_ <= 1) { + ReduceSumCPU(dptr, 0, total); + } else { + #pragma omp parallel for schedule(static) num_threads(nthread_reduction_) + for (long j = 0; j < ntask; ++j) { // NOLINT(*) + size_t k = static_cast(j); + size_t begin = std::min(k * step, total); + size_t end = std::min((k + 1) * step, total); + if (j == ntask - 1) CHECK_EQ(end, total); + ReduceSumCPU(dptr, begin, static_cast(end - begin)); + } + } + } + /// \brief temperal space for pushing and pull + struct BufferEntry { + /// \brief the merged value + NDArray merged; + /// \brief the cpu buffer for gpu data + std::vector copy_buf; + }; + std::unordered_map merge_buf_; + size_t bigarray_bound_; + int nthread_reduction_; +}; + +/** + * \brief an implementation of Comm that performs reduction on device + * directly. + * + * It is faster if the total device-to-device bandwidths is larger than + * device-to-cpu, which is often true for 4 or 8 GPUs. But it uses more device + * memory. 
+ */ +class CommDevice : public Comm { + public: + CommDevice() { + inited_ = false; + } + + virtual ~CommDevice() { } + + void Init(int key, const TShape &shape) override { + sorted_key_shape_.push_back(std::make_pair(key, shape)); + } + + const NDArray& Reduce(int key, const std::vector& src, + int priority) override { + // avoid extra copy for single device, but it may bring problems for + // abnormal usage of kvstore + if (src.size() == 1) { + return src[0]; + } + + if (!inited_) { + std::vector devs; + for (const auto& a : src) { + devs.push_back(a.ctx()); + } + InitMergeBuffer(devs); + if (dmlc::GetEnv("MXNET_ENABLE_GPU_P2P", 1)) { + EnableP2P(devs); + } + } + + auto& buf = merge_buf_[key]; + std::vector reduce(src.size()); + CopyFromTo(src[0], &(buf.merged), priority); + reduce[0] = buf.merged; + + if (buf.copy_buf.empty()) { + // TODO(mli) this results in large device memory usage for huge ndarray, + // such as the largest fullc in VGG. consider to do segment reduce with + // NDArray.Slice or gpu direct memory access. 
for the latter, we need to + // remove some ctx check, and also it reduces 20% perf + buf.copy_buf.resize(src.size()-1); + for (size_t i = 0; i < src.size()-1; ++i) { + buf.copy_buf[i] = NDArray(buf.merged.shape(), buf.merged.ctx()); + } + } + for (size_t i = 0; i < src.size()-1; ++i) { + CopyFromTo(src[i+1], &(buf.copy_buf[i]), priority); + reduce[i+1] = buf.copy_buf[i]; + } + + ElementwiseSum(reduce, &buf.merged); + + return buf.merged; + } + + void Broadcast(int key, const NDArray& src, + const std::vector dst, int priority) override { + if (!inited_) { + // copy to a random device first + int dev_id = key % dst.size(); + CopyFromTo(src, dst[dev_id], priority); + for (size_t i = 0; i < dst.size(); ++i) { + if (i != static_cast(dev_id)) { + CopyFromTo(*dst[dev_id], dst[i], priority); + } + } + } else { + auto& buf = merge_buf_[key]; + CopyFromTo(src, &buf.merged, priority); + for (auto d : dst) { + CopyFromTo(buf.merged, d, priority); + } + } + } + + private: + void EnableP2P(const std::vector& devs) { +#if MXNET_USE_CUDA + std::vector gpus; + for (const auto& d : devs) { + if (d.dev_mask() == gpu::kDevMask) { + gpus.push_back(d.dev_id); + } + } + int n = static_cast(gpus.size()); + int enabled = 0; + std::vector p2p(n*n); + for (int i = 0; i < n; ++i) { + cudaSetDevice(gpus[i]); + for (int j = 0; j < n; j++) { + int access; + cudaDeviceCanAccessPeer(&access, gpus[i], gpus[j]); + if (access) { + cudaError_t e = cudaDeviceEnablePeerAccess(gpus[j], 0); + if (e == cudaSuccess) { + ++enabled; + p2p[i*n+j] = 1; + } + } + } + } + if (enabled != n*(n-1)) { + // print warning info if not fully enabled + LOG(WARNING) << "only " << enabled << " out of " + << n*(n-1) << " GPU pairs are enabled direct access. " + << "It may affect the perofmrance. " + << "You can set MXNET_ENABLE_GPU_P2P=0 to turn it off"; + std::string access(n, '.'); + for (int i = 0; i < n; ++i) { + for (int j = 0; j < n; ++j) { + access[j] = p2p[i*n+j] ? 
'v' : '.'; + } + LOG(WARNING) << access; + } + } +#endif + } + + using KeyShape = std::pair; + // try to allocate buff on device evenly + void InitMergeBuffer(const std::vector& devs) { + std::sort(sorted_key_shape_.begin(), sorted_key_shape_.end(), []( + const KeyShape& a, const KeyShape& b) { + return a.second.Size() > b.second.Size(); + }); + + std::unordered_map> ctx_info; + for (auto d : devs) { + ctx_info[d.dev_id] = std::make_pair(d, 0); + } + for (size_t i = 0; i < sorted_key_shape_.size(); ++i) { + int k = sorted_key_shape_[i].first; + TShape s = sorted_key_shape_[i].second; + auto& buf = merge_buf_[k]; + Context ctx; + size_t min_size = std::numeric_limits::max(); + for (auto it = ctx_info.begin(); it != ctx_info.end(); ++it) { + size_t size = it->second.second; + if (size <= min_size) { + ctx = it->second.first; + min_size = size; + } + } + buf.merged = NDArray(s, ctx); + ctx_info[ctx.dev_id].second += s.Size(); + } + inited_ = true; + } + + std::vector sorted_key_shape_; + /// \brief temperal space for pushing and pull + struct BufferEntry { + /// \brief the merged value + NDArray merged; + /// \brief the gpu buffer + std::vector copy_buf; + }; + std::unordered_map merge_buf_; + bool inited_; +}; + +} // namespace kvstore +} // namespace mxnet +#endif // MXNET_KVSTORE_COMM_H_ diff --git a/src/kvstore/kvstore.cc b/src/kvstore/kvstore.cc index 95000fccae29..be5662e8a6db 100644 --- a/src/kvstore/kvstore.cc +++ b/src/kvstore/kvstore.cc @@ -7,7 +7,7 @@ #include #include #include "./kvstore_local.h" -#include "./kvstore_device.h" +// #include "./kvstore_device.h" #if MXNET_USE_DIST_KVSTORE #include "./kvstore_dist.h" #endif // MXNET_USE_DIST_KVSTORE @@ -18,24 +18,18 @@ KVStore* KVStore::Create(const char *type_name) { std::string tname = type_name; std::transform(tname.begin(), tname.end(), tname.begin(), ::tolower); KVStore* kv = nullptr; - if (tname == "local" || - tname == "local_update_cpu" || - tname == "local_allreduce_cpu") { - kv = new 
kvstore::KVStoreLocal(); - } else if (tname == "device" || - tname == "local_update_device" || - tname == "local_allreduce_device") { - kv = new kvstore::KVStoreDevice(true); - } else if (tname == "dist_async" || - tname == "dist_sync" || - tname == "dist_sync_device" || - tname == "dist") { + bool use_device_comm = false; + auto has = [tname](const std::string& pattern) { + return tname.find(pattern) != std::string::npos; + }; + if (has("device")) { + use_device_comm = true; + } + + if (has("dist")) { #if MXNET_USE_DIST_KVSTORE - kv = new kvstore::KVStoreDist( - tname.find("device") != std::string::npos); - if (tname == "dist_sync" && - kv->IsWorkerNode() && - kv->get_rank() == 0) { + kv = new kvstore::KVStoreDist(use_device_comm); + if (!has("_async") && kv->IsWorkerNode() && kv->get_rank() == 0) { // configure the server to be the sync mode kv->SendCommandToServers(kvstore::kSyncMode, ""); } @@ -44,7 +38,7 @@ KVStore* KVStore::Create(const char *type_name) { return nullptr; #endif // MXNET_USE_DIST_KVSTORE } else { - LOG(FATAL) << "Unknown KVStore type \"" << tname << "\""; + kv = new kvstore::KVStoreLocal(use_device_comm); } kv->type_ = tname; return kv; diff --git a/src/kvstore/kvstore_device.h b/src/kvstore/kvstore_device.h deleted file mode 100644 index 82c04f9ec337..000000000000 --- a/src/kvstore/kvstore_device.h +++ /dev/null @@ -1,135 +0,0 @@ -/*! - * Copyright (c) 2015 by Contributors - * \file kvstore_device.h - * \brief Device implementation of KVStore that do reduction on GPU reduction. - */ -#ifndef MXNET_KVSTORE_KVSTORE_DEVICE_H_ -#define MXNET_KVSTORE_KVSTORE_DEVICE_H_ - -#include -#include -#include -#include -#include -#include -#include "./kvstore_local.h" -#include "../common/utils.h" - -namespace mxnet { -namespace kvstore { -/*! - * \brief Device implementation of KVStore that do reduction on GPU reduction. 
- */ -class KVStoreDevice : public KVStoreLocal { - public: - explicit KVStoreDevice(bool device_mode) - : device_mode_(device_mode) {} - - protected: - using KeyShape = std::pair; - void Init(const std::vector& keys, - const std::vector& values) override { - KVStoreLocal::Init(keys, values); - - for (size_t i = 0; i < keys.size(); ++i) { - sorted_key_shape_.push_back(std::make_pair(keys[i], values[i].shape())); - } - } - - void InitMergeBuffers(const std::vector& val) { - std::sort(sorted_key_shape_.begin(), sorted_key_shape_.end(), []( - const KeyShape& a, const KeyShape& b) { - return a.second.Size() > b.second.Size(); - }); - - CHECK(!val.empty()); - std::unordered_map> ctx_info; - for (size_t i = 0; i < val.size(); ++i) { - int32_t dev_id = val[i].ctx().dev_id; - ctx_info[dev_id] = std::make_pair(val[i].ctx(), 0); - } - for (size_t i = 0; i < sorted_key_shape_.size(); ++i) { - int k = sorted_key_shape_[i].first; - TShape s = sorted_key_shape_[i].second; - auto& tm_buf = merge_buf_[k]; - size_t min_size = std::numeric_limits::max(); - for (auto it = ctx_info.begin(); it != ctx_info.end(); ++it) { - size_t tm_size = it->second.second; - if (tm_size <= min_size) { - tm_buf.ctx = it->second.first; - min_size = tm_size; - } - } - - tm_buf.merged = NDArray(s, Context::CPUPinned(tm_buf.ctx.dev_id)); - tm_buf.merged_device = NDArray(s, tm_buf.ctx); - ctx_info[tm_buf.ctx.dev_id].second += s.Size(); - } - } - - const NDArray& MergePushValue( - int key, const std::vector& val, int priority) override { - if (!device_mode_) { - return KVStoreLocal::MergePushValue(key, val, priority); - } - if (!buf_initialized_) { - InitMergeBuffers(val); - buf_initialized_ = true; - } - - auto& buf = merge_buf_[key]; - std::vector reduce(val.size()); - CHECK(!buf.merged_device.is_none()); - CopyFromTo(val[0], &(buf.merged_device), priority); - reduce[0] = buf.merged_device; - - for (size_t i = 1; i < val.size(); ++i) { - NDArray *copy_buf = buf.AllocCopyBuf( - i, buf.ctx, val[0].shape()); 
- CopyFromTo(val[i], copy_buf, priority); - reduce[i] = *copy_buf; - } - ElementwiseSum(reduce, &buf.merged_device); - - if (updater_ != nullptr) { - CopyFromTo(buf.merged_device, &(buf.merged)); - return buf.merged; - } else { - return buf.merged_device; - } - } - - void ScatterPullValue( - int key, - const NDArray& src, - const std::vector& vals, - int priority) override { - if (!device_mode_) { - KVStoreLocal::ScatterPullValue(key, src, vals, priority); - return; - } - auto it = merge_buf_.find(key); - if (it != merge_buf_.end() && it->first == key) { - auto& buf = it->second; - if (!buf.merged_device.is_none()) { - CopyFromTo(src, &(buf.merged_device)); - for (auto* vptr : vals) { - CopyFromTo(buf.merged_device, vptr, priority); - } - return; - } - } - // default, copy back - for (auto* vptr : vals) { - CopyFromTo(src, vptr, priority); - } - } - - private: - bool device_mode_; - bool buf_initialized_{false}; - std::vector sorted_key_shape_; -}; -} // namespace kvstore -} // namespace mxnet -#endif // MXNET_KVSTORE_KVSTORE_DEVICE_H_ diff --git a/src/kvstore/kvstore_dist.h b/src/kvstore/kvstore_dist.h index 2705effe0104..905b4e4ec67f 100644 --- a/src/kvstore/kvstore_dist.h +++ b/src/kvstore/kvstore_dist.h @@ -7,7 +7,7 @@ #define MXNET_KVSTORE_KVSTORE_DIST_H_ #include #include -#include "./kvstore_device.h" +#include "./kvstore_local.h" #include "mxnet/engine.h" #include "ps/ps.h" #include "./kvstore_dist_server.h" @@ -25,26 +25,32 @@ namespace kvstore { * it's the server node's job to control the data consistency among all * workers. 
see details on \ref ServerHandle::Start */ -class KVStoreDist : public KVStoreDevice { +class KVStoreDist : public KVStoreLocal { public: - explicit KVStoreDist(bool device_mode) - : KVStoreDevice(device_mode), - ps_worker_(nullptr), server_(nullptr) { + explicit KVStoreDist(bool use_device_comm) + : KVStoreLocal(use_device_comm), ps_worker_(nullptr), server_(nullptr) { if (IsWorkerNode()) { ps_worker_ = new ps::KVWorker(0); - ps::Start("mxnet\0"); + ps::StartAsync("mxnet\0"); + if (!ps::Postoffice::Get()->is_recovery()) { + ps::Postoffice::Get()->Barrier( + ps::kWorkerGroup + ps::kServerGroup + ps::kScheduler); + } } + bigarray_bound_ = dmlc::GetEnv("MXNET_KVSTORE_BIGARRAY_BOUND", 1000 * 1000); } virtual ~KVStoreDist() { Engine::Get()->WaitForAll(); if (IsWorkerNode()) { - ps::Postoffice::Get()->Barrier(ps::kWorkerGroup); - if (get_rank() == 0) { - // stop the executor at servers - SendCommandToServers(kStopServer, ""); + if (barrier_before_exit_) { + Barrier(); + if (get_rank() == 0) { + // stop the executor at servers + SendCommandToServers(kStopServer, ""); + } } - ps::Finalize(); + ps::Finalize(barrier_before_exit_); delete ps_worker_; } } @@ -52,50 +58,27 @@ class KVStoreDist : public KVStoreDevice { void Init(const std::vector& keys, const std::vector& values) override { CheckUnique(keys); + for (size_t i = 0; i < keys.size(); ++i) { + comm_->Init(keys[i], values[i].shape()); + } if (get_rank() == 0) { - Push(keys, values, 0); + Push_(keys, values, 0, false); // wait until the push is finished - Wait(keys); + for (const auto& v : values) { + v.WaitToWrite(); + } } else { // do nothing } - Barrier(); + if (!ps::Postoffice::Get()->is_recovery()) { + Barrier(); + } } void Push(const std::vector& keys, const std::vector& values, - int priority) override { - // first aggregate the values over keys - std::vector uniq_keys; - std::vector > grouped_vals; - GroupKVPairs(keys, values, &uniq_keys, &grouped_vals); - - for (size_t i = 0; i < uniq_keys.size(); ++i) { - // 
merge over devcies - int key = uniq_keys[i]; - const NDArray& merged = MergePushValue(key, grouped_vals[i], priority); - - // push to servers - auto push_to_servers = - [this, key, merged](RunContext rctx, Engine::CallbackOnComplete cb) { - // convert to ps keys - size_t size = merged.shape().Size(); - PSKV& pskv = EncodeKey(key, size); - - // do push - real_t* data = static_cast(merged.data().dptr_); - // false means no delete - ps::SArray vals(data, size, false); - CHECK_NOTNULL(ps_worker_)->ZPush( - pskv.keys, vals, pskv.lens, 0, [cb]() { cb(); }); - }; - Engine::Get()->PushAsync( - push_to_servers, - pinned_ctx_, - {merged.var()}, - {}, - FnProperty::kNormal, priority); - } + int priority) override { + Push_(keys, values, priority, true); } void Pull(const std::vector& keys, @@ -107,19 +90,18 @@ class KVStoreDist : public KVStoreDevice { for (size_t i = 0; i < uniq_keys.size(); ++i) { int key = uniq_keys[i]; - const auto& vals = grouped_vals[i]; - - // first pull to a buffer. we reuse the merge buf so that all pushes and - // pulls on the same key on the local machine are always sequentials - auto& buf = merge_buf_[key].merged; - if (buf.is_none()) { - buf = NDArray(vals[0]->shape(), pinned_ctx_); + // use the same array for merging to guarantee that pull always happens + // after the previous push on this key + auto& recv_buf = comm_buf_[key]; + if (recv_buf.is_none()) { + // it may happen for the first time a no-rank-0 worker pull the weight. 
+ recv_buf = NDArray(grouped_vals[i][0]->shape(), pinned_ctx_); } + real_t* data = static_cast(recv_buf.data().dptr_); + size_t size = recv_buf.shape().Size(); - auto pull_from_servers = [this, key, buf] ( + auto pull_from_servers = [this, key, data, size]( RunContext rctx, Engine::CallbackOnComplete cb) { - real_t* data = static_cast(buf.data().dptr_); - size_t size = buf.shape().Size(); // convert to ps keys PSKV& pskv = EncodeKey(key, size); @@ -133,10 +115,10 @@ class KVStoreDist : public KVStoreDevice { pull_from_servers, pinned_ctx_, {}, - {buf.var()}, + {recv_buf.var()}, FnProperty::kNormal, priority); - ScatterPullValue(key, buf, vals, priority); + comm_->Broadcast(key, recv_buf, grouped_vals[i], priority); } } @@ -164,6 +146,17 @@ class KVStoreDist : public KVStoreDevice { int get_rank() const override { return ps::MyRank(); } + int get_num_dead_node(int node_id, int timeout) const override { + int number = 0; + auto dead_nodes = ps::Postoffice::Get()->GetDeadNodes(timeout); + const auto& watch_nodes = ps::Postoffice::Get()->GetNodeIDs(node_id); + std::unordered_set watch_set(watch_nodes.begin(), watch_nodes.end()); + for (int r : dead_nodes) { + if (watch_set.find(r) != watch_set.end()) number++; + } + return number; + } + void RunServer(const Controller& controller) override { CHECK(!IsWorkerNode()); if (IsServerNode()) { @@ -171,27 +164,64 @@ class KVStoreDist : public KVStoreDevice { server_->set_controller(controller); } - ps::Start("mxnet_server\0"); + ps::StartAsync("mxnet_server\0"); + if (!ps::Postoffice::Get()->is_recovery()) { + ps::Postoffice::Get()->Barrier( + ps::kWorkerGroup + ps::kServerGroup + ps::kScheduler); + } if (server_) server_->Run(); ps::Finalize(); - delete server_; server_ = nullptr; + if (server_) { + delete server_; + } + server_ = nullptr; } private: - /** - * \brief Wait until all pushes and pulls issued on each key have been - * finished - * - * \param keys a list of keys - */ - void Wait(const std::vector& keys) { - for 
(int key : keys) { - auto it = merge_buf_.find(key); - CHECK(it != merge_buf_.end()) - << "there is no push/pull on key " << key << " before"; - CHECK(!it->second.merged.is_none()) - << "there is no push/pull on key " << key << " before"; - it->second.merged.WaitToWrite(); + void Push_(const std::vector& keys, + const std::vector& values, + int priority, + bool do_merge) { + // first aggregate the values over keys + std::vector uniq_keys; + std::vector > grouped_vals; + GroupKVPairs(keys, values, &uniq_keys, &grouped_vals); + + for (size_t i = 0; i < uniq_keys.size(); ++i) { + // merge over devcies + int key = uniq_keys[i]; + const auto& vals = grouped_vals[i]; + NDArray merged = do_merge ? comm_->Reduce(key, vals, priority) : vals[0]; + + auto& send_buf = comm_buf_[key]; + if (merged.ctx().dev_mask() == cpu::kDevMask) { + send_buf = merged; // avoid memory copy + } else { + if (send_buf.is_none()) { + send_buf = NDArray(merged.shape(), pinned_ctx_); + } + CopyFromTo(merged, &send_buf); + } + + // push to servers + size_t size = send_buf.shape().Size(); + real_t* data = static_cast(send_buf.data().dptr_); + auto push_to_servers = + [this, key, data, size](RunContext rctx, Engine::CallbackOnComplete cb) { + // convert to ps keys + PSKV& pskv = EncodeKey(key, size); + + // do push. false means no delete + ps::SArray vals(data, size, false); + CHECK_NOTNULL(ps_worker_)->ZPush( + pskv.keys, vals, pskv.lens, 0, [cb]() { cb(); }); + }; + Engine::Get()->PushAsync( + push_to_servers, + pinned_ctx_, + {send_buf.var()}, + {}, + FnProperty::kNormal, priority); } } @@ -267,17 +297,20 @@ class KVStoreDist : public KVStoreDevice { return pskv; } - // whether use device distributed local sync. 
- bool device_mode_; /** * \brief for worker to push and pull data */ ps::KVWorker* ps_worker_; - /** * \brief the server handle */ KVStoreDistServer* server_; + /** + * \brief threshold for partition + */ + size_t bigarray_bound_; + /// \brief send & recver buffer + std::unordered_map comm_buf_; }; } // namespace kvstore diff --git a/src/kvstore/kvstore_dist_server.h b/src/kvstore/kvstore_dist_server.h index 0bf446d50ca5..02d4a38c2b10 100644 --- a/src/kvstore/kvstore_dist_server.h +++ b/src/kvstore/kvstore_dist_server.h @@ -179,10 +179,15 @@ class KVStoreDistServer { if (merged.request.size() == (size_t)ps::NumWorkers()) { // let the main thread to execute updater_, which is necessary for // python - exec_.Exec([this, key, &merged, &stored](){ - CHECK(updater_); - updater_(key, merged.array, &stored); - }); + if (updater_) { + exec_.Exec([this, key, &merged, &stored](){ + CHECK(updater_); + updater_(key, merged.array, &stored); + }); + } else { + // if no updater, just copy + CopyFromTo(merged.array, &stored); + } for (const auto& req : merged.request) { server->Response(req); } @@ -207,6 +212,7 @@ class KVStoreDistServer { int len = stored.shape()[0]; response.keys = req_data.keys; response.lens = {len}; + // TODO(mli) try to remove this CopyFrom response.vals.CopyFrom(static_cast(stored.data().dptr_), len); server->Response(req_meta, response); } @@ -237,8 +243,6 @@ class KVStoreDistServer { ps::KVServer* ps_server_; }; - - } // namespace kvstore } // namespace mxnet diff --git a/src/kvstore/kvstore_local.h b/src/kvstore/kvstore_local.h index 3e6ab7b5b3b0..c493fde0f4c1 100644 --- a/src/kvstore/kvstore_local.h +++ b/src/kvstore/kvstore_local.h @@ -12,6 +12,7 @@ #include #include #include +#include "./comm.h" namespace mxnet { namespace kvstore { @@ -20,12 +21,20 @@ namespace kvstore { */ class KVStoreLocal : public KVStore { public: - KVStoreLocal() { - pinned_ctx_ = (MXNET_USE_CUDA != 0) ? 
- Context::CPUPinned(0) : Context::CPU(); - // the server perameters - nthread_reduction_ = dmlc::GetEnv("MXNET_KVSTORE_REDUCTION_NTHREADS", 4); - bigarray_bound_ = dmlc::GetEnv("MXNET_KVSTORE_BIGARRAY_BOUND", 1000 * 1000); + /* + * \param use_device_comm + */ + explicit KVStoreLocal(bool use_device_comm) : KVStore() { + if (use_device_comm) { + comm_ = new CommDevice(); + } else { + comm_ = new CommCPU(); + } + pinned_ctx_ = (MXNET_USE_CUDA != 0) ? Context::CPUPinned(0) : Context::CPU(); + } + + virtual ~KVStoreLocal() { + delete comm_; } void Init(const std::vector& keys, @@ -34,6 +43,7 @@ class KVStoreLocal : public KVStore { CHECK(local_.find(keys[i]) == local_.end()) << "duplicate init of key " << keys[i]; local_[keys[i]] = values[i].Copy(pinned_ctx_); + comm_->Init(keys[i], values[i].shape()); } } @@ -46,11 +56,18 @@ class KVStoreLocal : public KVStore { for (size_t i = 0; i < uniq_keys.size(); ++i) { int key = uniq_keys[i]; - const NDArray& merged = MergePushValue(key, grouped_vals[i], priority); + const NDArray& merged = comm_->Reduce(key, grouped_vals[i], priority); + NDArray& local = local_[key]; if (updater_ != nullptr) { - auto it = local_.find(key); - CHECK(it != local_.end()) << "key " << key << " has not been inited"; - updater_(key, merged, &(it->second)); + CHECK(!local.is_none()) << "key " << key << " has not been inited"; + // if merged is on gpu, we may need copy weight from cpu to gpu + if (merged.ctx().dev_mask() != cpu::kDevMask && + local.ctx().dev_mask() == cpu::kDevMask) { + local = local.Copy(merged.ctx()); + } + updater_(key, merged, &local); + } else { + local = merged; } } } @@ -64,39 +81,13 @@ class KVStoreLocal : public KVStore { for (size_t i = 0; i < uniq_keys.size(); ++i) { int key = uniq_keys[i]; - auto it = merge_buf_.find(key); - if (updater_ != nullptr || it == merge_buf_.end()) { - auto it = local_.find(key); - CHECK(it != local_.end()) << "key " << key << " has not been inited"; - ScatterPullValue( - key, it->second, 
grouped_vals[i], priority); - } else { - ScatterPullValue( - key, it->second.merged, grouped_vals[i], priority); - } + const NDArray& local = local_[key]; + CHECK(!local.is_none()) << "key " << key << " has not been inited"; + comm_->Broadcast(key, local, grouped_vals[i], priority); } } protected: - /// \brief temperal space for pushing and pull - struct BufferEntry { - // Context of merged - Context ctx; - // the merged value - NDArray merged; - // the merged value on device - NDArray merged_device; - /// \brief the cpu buffer for gpu data - std::vector copy_buf; - // allocate copy buffer, if it has not been allocated - inline NDArray *AllocCopyBuf(size_t index, Context ctx, const TShape& shape) { - if (index >= copy_buf.size()) copy_buf.resize(index + 1); - if (copy_buf[index].is_none()) { - copy_buf[index] = NDArray(shape, ctx); - } - return ©_buf[index]; - } - }; /** * \brief group values on keys */ @@ -127,127 +118,12 @@ class KVStoreLocal : public KVStore { } } } - /*! - * \brief returns the aggregated push value - */ - virtual const NDArray& MergePushValue( - int key, const std::vector& val, int priority) { - auto& buf = merge_buf_[key]; - // copy buffer - std::vector const_vars(val.size() - 1); - std::vector reduce(val.size()); - - if (buf.merged.is_none()) { - buf.ctx = Context::CPUPinned(val[0].ctx().dev_id); - if (MXNET_USE_CUDA == 0) buf.ctx = Context::CPU(); - buf.merged = NDArray(val[0].shape(), buf.ctx); - } - - CopyFromTo(val[0], &(buf.merged), priority); - reduce[0] = buf.merged; - - for (size_t i = 1; i < val.size(); ++i) { - const NDArray& v = val[i]; - Context ctx = v.ctx(); - if (ctx.dev_mask() == cpu::kDevMask) { - reduce[i] = val[i]; - } else { - NDArray *copy_buf = buf.AllocCopyBuf( - i, Context::CPUPinned(ctx.dev_id), val[0].shape()); - CopyFromTo(val[i], copy_buf, priority); - reduce[i] = *copy_buf; - } - const_vars[i - 1] = reduce[i].var(); - } - - Engine::Get()->PushSync([reduce, this](RunContext rctx) { - ReduceSumCPU(reduce); - }, 
Context::CPU(), const_vars, {reduce[0].var()}, - FnProperty::kCPUPrioritized, priority); - return buf.merged; - } - - virtual void ScatterPullValue( - int key, - const NDArray& src, - const std::vector& vals, - int priority) { - for (auto* vptr : vals) { - CopyFromTo(src, vptr, priority); - } - } - - /// \brief buffer for merging push value - std::unordered_map merge_buf_; - // pinned context + /// reducer and broadcaster + Comm* comm_; + /// pinned context Context pinned_ctx_; - // the lower bound of a big array - size_t bigarray_bound_; - - private: - inline static void ReduceSumCPU(const std::vector &dptr, - size_t offset, index_t size) { - using namespace mshadow; // NOLINT(*) - Tensor in_0(dptr[0] + offset, Shape1(size)); - switch (dptr.size()) { - case 2: { - Tensor in_1(dptr[1] + offset, Shape1(size)); - in_0 += in_1; - break; - } - case 3: { - Tensor in_1(dptr[1] + offset, Shape1(size)); - Tensor in_2(dptr[2] + offset, Shape1(size)); - in_0 += in_1 + in_2; - break; - } - case 4: { - Tensor in_1(dptr[1] + offset, Shape1(size)); - Tensor in_2(dptr[2] + offset, Shape1(size)); - Tensor in_3(dptr[3] + offset, Shape1(size)); - in_0 += in_1 + in_2 + in_3; - break; - } - default: { - for (size_t i = 1; i < dptr.size(); ++i) { - Tensor in_k(dptr[i] + offset, Shape1(size)); - in_0 += in_k; - } - } - } - } - // reduce sum into val[0] - // this is performance critical - inline void ReduceSumCPU(const std::vector &in_data) { - const size_t step = std::min(bigarray_bound_, static_cast(4 << 10)); - // ge ptr out - std::vector dptr(in_data.size()); - for (size_t i = 0; i < in_data.size(); ++i) { - TBlob data = in_data[i].data(); - CHECK(data.CheckContiguous()); - dptr[i] = data.FlatTo2D().dptr_; - } - size_t total = in_data[0].shape().Size(); - long ntask = (total + step - 1) / step; // NOLINT(*) - if (total < bigarray_bound_ || nthread_reduction_ <= 1) { - ReduceSumCPU(dptr, 0, total); - } else { - #pragma omp parallel for schedule(static) num_threads(nthread_reduction_) 
- for (long j = 0; j < ntask; ++j) { // NOLINT(*) - size_t k = static_cast(j); - size_t begin = std::min(k * step, total); - size_t end = std::min((k + 1) * step, total); - if (j == ntask - 1) CHECK_EQ(end, total); - ReduceSumCPU(dptr, begin, static_cast(end - begin)); - } - } - } - /// \brief buffer for storing local values std::unordered_map local_; - - // number of threads to do reduction - int nthread_reduction_; }; } // namespace kvstore } // namespace mxnet diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc index 4bd5bcb8b68e..a3b5ee1f3995 100644 --- a/src/ndarray/ndarray.cc +++ b/src/ndarray/ndarray.cc @@ -223,7 +223,71 @@ void ScalarOp(const NDArray &lhs, } } +void CopySliceTo(const NDArray &from, int slice_dim, index_t start, index_t end, + NDArray *to, int priority) { + CHECK(from.shape().ndim() == to->shape().ndim()) + << "from and to must have the same number of dimensions"; + CHECK(slice_dim < from.shape().ndim()) + << "slice dimension out of bounds"; + CHECK(start < end) + << "slice is empty"; + CHECK(end < from.shape()[slice_dim]) + << "slice out of bounds"; + + mshadow::Shape<3> from_shape = from.shape().FlatTo3D(slice_dim); + mshadow::Shape<3> to_shape = to->shape().FlatTo3D(slice_dim); + CHECK(from_shape[0] == to_shape[0] && from_shape[2] == to_shape[2]) + << "shape incompatible"; + CHECK(end - start == to_shape[1]) + << "shape incompatible"; + + int a = from.ctx().dev_mask(); + int b = to->ctx().dev_mask(); + + std::vector const_vars{from.var()}; + NDArray ret = *to; + +#define MXNET_COPYSLICETO_IMPL(xpu1, xpu2) \ + Engine::Get()->PushSync([from, ret, from_shape, start, end](RunContext ctx) { \ + ret.CheckAndAlloc(); \ + for (index_t i = 0; i < from_shape[0]; ++i) { \ + index_t src_idx = i * (from_shape[1] * from_shape[2]) + \ + start * from_shape[2]; \ + index_t length = from_shape[2] * (end - start); \ + index_t dst_idx = i * length; \ + \ + TBlob blob_from = from.raw_data(src_idx, length); \ + TBlob blob_to = ret.raw_data(dst_idx, 
length); \ + ndarray::Copy(blob_from, &blob_to, \ + from.ctx(), ret.ctx(), ctx); \ + } \ + }, from.ctx(), const_vars, {ret.var()}, \ + FnProperty::kNormal, priority) + + if (a == cpu::kDevMask && b == cpu::kDevMask) { + MXNET_COPYSLICETO_IMPL(cpu, cpu); + } else { +#if MXNET_USE_CUDA + if (a == cpu::kDevMask && b == gpu::kDevMask) { + MXNET_COPYSLICETO_IMPL(cpu, gpu); + } else if (a == gpu::kDevMask && b == cpu::kDevMask) { + MXNET_COPYSLICETO_IMPL(gpu, cpu); + } else if (a == gpu::kDevMask && b == gpu::kDevMask) { + MXNET_COPYSLICETO_IMPL(gpu, gpu); + } else { + LOG(FATAL) << "unknown device mask"; + } +#else + LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR; +#endif + } +} + void CopyFromTo(const NDArray &from, NDArray *to, int priority) { + if (from.var() == to->var()) { + // skip to copy to itself + return; + } CHECK(from.shape() == to->shape()) << "operands shape mismatch"; CHECK(from.shape().ndim() != 0) @@ -743,6 +807,19 @@ MXNET_REGISTER_NDARRAY_FUN(_copyto) .set_function(CopyFromToSimple) .set_type_mask(kNDArrayArgBeforeScalar); +MXNET_REGISTER_NDARRAY_FUN(_copy_slice_to) +.set_body([](NDArray **u, real_t *s, NDArray **out, + int num_params, char **param_keys, char **param_vals) { + CopySliceTo(*u[0], + static_cast(s[0]), + static_cast(s[1]), + static_cast(s[2]), out[0]); +}) +.set_num_use_vars(1) +.set_num_scalars(3) +.set_num_mutate_vars(1) +.set_type_mask(kNDArrayArgBeforeScalar); + // register random number generators MXNET_REGISTER_NDARRAY_FUN(_random_uniform) .set_body([](NDArray **u, real_t *s, NDArray **out, diff --git a/src/ndarray/ndarray_function.cc b/src/ndarray/ndarray_function.cc index cf6b180714ee..a5ba2660fd34 100644 --- a/src/ndarray/ndarray_function.cc +++ b/src/ndarray/ndarray_function.cc @@ -16,12 +16,12 @@ void Copy(const TBlob &from, TBlob *to, RunContext ctx) { MSHADOW_TYPE_SWITCH(to->type_flag_, DType, { if (to->type_flag_ == from.type_flag_) { - mshadow::Copy(to->FlatTo2D(), - from.FlatTo2D()); + mshadow::Copy(to->FlatTo1D(), + 
from.FlatTo1D()); } else { MSHADOW_TYPE_SWITCH(from.type_flag_, SrcDType, { - to->FlatTo2D() = - mshadow::expr::tcast(from.FlatTo2D()); + to->FlatTo1D() = + mshadow::expr::tcast(from.FlatTo1D()); }) } }) diff --git a/src/ndarray/ndarray_function.cu b/src/ndarray/ndarray_function.cu index 508efca5e542..ff6702f2f41b 100644 --- a/src/ndarray/ndarray_function.cu +++ b/src/ndarray/ndarray_function.cu @@ -18,8 +18,8 @@ void Copy(const TBlob &from, TBlob *to, CHECK_EQ(to->type_flag_, from.type_flag_) << "Source and target must have the same data type when copying across devices."; MSHADOW_TYPE_SWITCH(to->type_flag_, DType, { - mshadow::Copy(to->FlatTo2D(), - from.FlatTo2D(), + mshadow::Copy(to->FlatTo1D(), + from.FlatTo1D(), static_cast*>(ctx.stream)); }); } @@ -31,8 +31,8 @@ void Copy(const TBlob &from, TBlob *to, CHECK_EQ(to->type_flag_, from.type_flag_) << "Source and target must have the same data type when copying across devices."; MSHADOW_TYPE_SWITCH(to->type_flag_, DType, { - mshadow::Copy(to->FlatTo2D(), - from.FlatTo2D(), + mshadow::Copy(to->FlatTo1D(), + from.FlatTo1D(), static_cast*>(ctx.stream)); }); } @@ -45,13 +45,13 @@ void Copy(const TBlob &from, TBlob *to, mshadow::Stream* s = static_cast*>(ctx.stream); MSHADOW_TYPE_SWITCH(to->type_flag_, DType, { if (to->type_flag_ == from.type_flag_) { - mshadow::Copy(to->FlatTo2D(s), - from.FlatTo2D(s), + mshadow::Copy(to->FlatTo1D(s), + from.FlatTo1D(s), s); } else { MSHADOW_TYPE_SWITCH(from.type_flag_, SrcDType, { - to->FlatTo2D(s) = - mshadow::expr::tcast(from.FlatTo2D(s)); + to->FlatTo1D(s) = + mshadow::expr::tcast(from.FlatTo1D(s)); }) } }) diff --git a/src/operator/batch_norm-inl.h b/src/operator/batch_norm-inl.h index 03238b067ea3..e4fe423c1f46 100644 --- a/src/operator/batch_norm-inl.h +++ b/src/operator/batch_norm-inl.h @@ -89,7 +89,7 @@ class BatchNormOp : public Operator { Tensor moving_mean = aux_states[batchnorm::kMovingMean].get(s); Tensor moving_var = aux_states[batchnorm::kMovingVar].get(s); - if 
(ctx.is_train && param_.fix_gamma) slope = 1.f; + if (param_.fix_gamma) slope = 1.f; // whether use global statistics if (ctx.is_train && !param_.use_global_stats) { @@ -153,6 +153,8 @@ class BatchNormOp : public Operator { Tensor moving_mean = aux_states[batchnorm::kMovingMean].get(s); Tensor moving_var = aux_states[batchnorm::kMovingVar].get(s); + if (param_.fix_gamma) slope = 1.f; + if (ctx.is_train && !param_.use_global_stats) { // get requested temp space Tensor workspace = ctx.requested[batchnorm::kTempSpace].get_space( @@ -200,6 +202,7 @@ class BatchNormOp : public Operator { } else { Assign(gslope, req[batchnorm::kGamma], 0.0f); } + Assign(gbias, req[batchnorm::kBeta], sumall_except_dim<1>(grad)); Assign(grad_in, req[batchnorm::kData], (grad * broadcast<1>(slope, data.shape_)) * broadcast<1>( 1.0f / F(moving_var + param_.eps), data.shape_)); diff --git a/src/operator/batch_norm.cu b/src/operator/batch_norm.cu index 45f1a82307c2..ccd0000b061c 100644 --- a/src/operator/batch_norm.cu +++ b/src/operator/batch_norm.cu @@ -13,7 +13,11 @@ namespace op { template<> Operator *CreateOp(BatchNormParam param) { #if MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 5 - return new CuDNNBatchNormOp(param); + if (!param.use_global_stats) { + return new CuDNNBatchNormOp(param); + } else { + return new BatchNormOp(param); + } #else return new BatchNormOp(param); #endif diff --git a/src/operator/broadcast_mask_op-inl.h b/src/operator/broadcast_mask_op-inl.h new file mode 100644 index 000000000000..8f012922e1da --- /dev/null +++ b/src/operator/broadcast_mask_op-inl.h @@ -0,0 +1,95 @@ +/*! 
+ * Copyright (c) 2015 by Contributors + * \file broadcast_mask_op-inl.h + * \brief + * \author Bing Xu +*/ +#ifndef MXNET_OPERATOR_BROADCAST_MASK_OP_INL_H_ +#define MXNET_OPERATOR_BROADCAST_MASK_OP_INL_H_ + +#include +#include "./operator_common.h" + + +#if defined(__CUDACC__) +#define XPU gpu +#else +#define XPU cpu +#endif + +namespace mxnet { +namespace op { + +inline TShape ElementwiseMaskShape_(const TShape& lhs, + const TShape& rhs, + const EnvArguments& env) { + CHECK(lhs.ndim() > 1 && rhs.ndim() == 1) << + "source tensor should be 2D or more and mask should be 1D"; + CHECK_EQ(lhs[0], rhs[0]) << "The first dimention of inputs should be same"; + return TShape(lhs); +} + +template +void ElementwiseMaskForward_(const TBlob& lhs, + const TBlob& rhs, + const EnvArguments& env, + TBlob *ret, + OpReqType req, + RunContext ctx) { + using namespace mshadow; + using namespace mshadow::expr; + Stream *s = ctx.get_stream(); + CHECK_EQ(ret->type_flag_, lhs.type_flag_) + << "Binary function only support input/output with the same type"; + CHECK_EQ(ret->type_flag_, rhs.type_flag_) + << "Binary function only support input/output with the same type"; + CHECK(lhs.shape_.ndim() > 1 && rhs.shape_.ndim() == 1 && + lhs.shape_[0] == rhs.shape_[0]) << + "the first ndim of lhs and rhs must be equal, lhs should be 2D or more and rhs shoube be 1D" + " shape of lhs=" << lhs.shape_ << " shape of rhs=" << rhs.shape_; + MSHADOW_TYPE_SWITCH(ret->type_flag_, DType, { + mshadow::Tensor out = ret->FlatTo2D(s); + ASSIGN_DISPATCH(out, req, + // TODO(bing): swap because requirement of inplace, change mshadow later + mask(rhs.get(s), lhs.FlatTo2D(s))); + }); + return; +} + +template +void ElementwiseMaskBackward_(const OutputGrad& out_grad, + const Input0& lhs, + const Input1& rhs, + const EnvArguments& env, + TBlob* lhs_grad, + TBlob* rhs_grad, + OpReqType req_lhs_grad, + OpReqType req_rhs_grad, + RunContext ctx) { + using namespace mshadow; + using namespace mshadow::expr; + Stream *s = 
ctx.get_stream(); + MSHADOW_TYPE_SWITCH(lhs_grad->type_flag_, DType, { + mshadow::Tensor mout_grad = out_grad.data.FlatTo2D(s); + mshadow::Tensor mlhs_grad = lhs_grad->FlatTo2D(s); + mshadow::Tensor mrhs_data = rhs.data.get(s); + ASSIGN_DISPATCH(mlhs_grad, req_lhs_grad, + // TODO(bing): swap because requirement of inplace, change mshadow later + mask(mrhs_data, mout_grad)); + }); + return; +} + + +MXNET_REGISTER_SIMPLE_OP(element_mask, XPU) +.set_shape_function(ElementwiseMaskShape_) +.set_function(XPU::kDevMask, ElementwiseMaskForward_, kInplaceLhsOut, kRegisterSymbolic) +.set_gradient(XPU::kDevMask, ElementwiseMaskBackward_, kInplaceOutLhs) +.describe("rhs elmentwise mask lhs with broadcast"); + +} // namespace op +} // namespace mxnet + + +#endif // MXNET_OPERATOR_BROADCAST_MASK_OP_INL_H_ + diff --git a/src/operator/broadcast_mask_op.cc b/src/operator/broadcast_mask_op.cc new file mode 100644 index 000000000000..a32f57e81be7 --- /dev/null +++ b/src/operator/broadcast_mask_op.cc @@ -0,0 +1,8 @@ +/*! + * Copyright (c) 2015 by Contributors + * \file broadcast_mask_op.cc + * \brief + * \author Bing Xu +*/ +#include "./broadcast_mask_op-inl.h" + diff --git a/src/operator/broadcast_mask_op.cu b/src/operator/broadcast_mask_op.cu new file mode 100644 index 000000000000..822458687452 --- /dev/null +++ b/src/operator/broadcast_mask_op.cu @@ -0,0 +1,8 @@ +/*! 
+ * Copyright (c) 2015 by Contributors + * \file broadcast_mask_op.cu + * \brief + * \author Bing Xu +*/ +#include "./broadcast_mask_op-inl.h" + diff --git a/src/operator/broadcast_reduce_op_common.h b/src/operator/broadcast_reduce_op_common.h index 179935d7a882..bfe4ad937ac7 100644 --- a/src/operator/broadcast_reduce_op_common.h +++ b/src/operator/broadcast_reduce_op_common.h @@ -43,7 +43,7 @@ inline std::vector ParseAxes_(const TShape& param_axis, index_t max_ndi * \param src_shape shape of the source tensor */ inline void CheckContiguousAxes_(bool *is_contiguous_axes, index_t *reducing_size, - const mshadow::TShape &axes, const mshadow::TShape &src_shape) { + const TShape &axes, const TShape &src_shape) { *is_contiguous_axes = true; *reducing_size = 1; for (index_t i = 0; i < axes.ndim(); ++i) { @@ -57,13 +57,13 @@ inline void CheckContiguousAxes_(bool *is_contiguous_axes, index_t *reducing_siz template inline void CheckContiguousAxes_(bool *is_contiguous_axes, index_t *reducing_size, - const mshadow::TShape &axes, const mshadow::Shape &src_shape) { + const TShape &axes, const mshadow::Shape &src_shape) { CheckContiguousAxes_(is_contiguous_axes, reducing_size, axes, TShape(src_shape.shape_, src_shape.shape_ + dimsrc)); } -inline TShape GetBroadcastingAxes_(const mshadow::TShape &src_shape, - const mshadow::TShape &target_shape) { +inline TShape GetBroadcastingAxes_(const TShape &src_shape, + const TShape &target_shape) { std::vector axes_vec; CHECK_EQ(target_shape.ndim(), src_shape.ndim()); for (index_t i = 0; i < src_shape.ndim(); ++i) { diff --git a/src/operator/concat-inl.h b/src/operator/concat-inl.h index f8de862ea3cb..95587b6d5b5a 100644 --- a/src/operator/concat-inl.h +++ b/src/operator/concat-inl.h @@ -139,7 +139,7 @@ class ConcatProp : public OperatorProperty { CHECK_EQ(in_shape->size(), static_cast(param_.num_args)); TShape dshape = in_shape->at(concat_enum::kData0); if (dshape.ndim() == 0) return false; - CHECK_GT(dshape.ndim(), 1); + 
CHECK_GE(dshape.ndim(), 1); CHECK_LT(static_cast(param_.dim), dshape.ndim()) <<"the dimension to be concated is not in the range of input's dimension"; for (int i = 1; i < param_.num_args; ++i) { diff --git a/src/operator/convolution-inl.h b/src/operator/convolution-inl.h index 4a3425fdbdbd..310f91e8cc16 100644 --- a/src/operator/convolution-inl.h +++ b/src/operator/convolution-inl.h @@ -38,6 +38,7 @@ struct ConvolutionParam : public dmlc::Parameter { uint64_t workspace; bool no_bias; int cudnn_tune; + bool cudnn_off; DMLC_DECLARE_PARAMETER(ConvolutionParam) { int shape[] = {1, 1}; DMLC_DECLARE_FIELD(kernel).describe("convolution kernel size: (y, x) or (d, y, x)"); @@ -67,6 +68,8 @@ struct ConvolutionParam : public dmlc::Parameter { "Leads to higher startup time but may give better speed." "auto tune is turned off by default." "Set environment varialbe MXNET_CUDNN_AUTOTUNE_DEFAULT=1 to turn on by default."); + DMLC_DECLARE_FIELD(cudnn_off).set_default(false) + .describe("Turn off cudnn."); } }; diff --git a/src/operator/convolution.cu b/src/operator/convolution.cu index 6f5904658e20..857dd74b4614 100644 --- a/src/operator/convolution.cu +++ b/src/operator/convolution.cu @@ -20,7 +20,7 @@ Operator* CreateOp(ConvolutionParam param, int dtype, Context ctx) { Operator *op = NULL; #if MXNET_USE_CUDNN == 1 - if (param.dilate[0] == 1 && param.dilate[1] == 1) { + if (param.dilate[0] == 1 && param.dilate[1] == 1 && !param.cudnn_off) { MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { op = new CuDNNConvolutionOp(param, in_shape, out_shape, ctx); }) diff --git a/src/operator/cudnn_batch_norm-inl.h b/src/operator/cudnn_batch_norm-inl.h index c4f9afaafeef..5804b1cf3675 100644 --- a/src/operator/cudnn_batch_norm-inl.h +++ b/src/operator/cudnn_batch_norm-inl.h @@ -99,7 +99,7 @@ class CuDNNBatchNormOp : public Operator { .get_with_shape(Shape1(shape_[1]), s); float a = 1.0f, b = 0.0f; - if (ctx.is_train && param_.fix_gamma) gamma = 1.f; + if (param_.fix_gamma) gamma = 1.f; if 
(ctx.is_train) { Tensor save_mean = @@ -175,6 +175,9 @@ class CuDNNBatchNormOp : public Operator { float b = 0.0f; float b_add = 1.0f; CHECK_EQ(s->dnn_handle_ownership_, mshadow::Stream::OwnHandle); + + if (param_.fix_gamma) gamma = 1.f; + #if CUDNN_VERSION >= 4007 CHECK_EQ(cudnnBatchNormalizationBackward(s->dnn_handle_, CUDNN_BATCHNORM_SPATIAL, diff --git a/src/operator/cudnn_lrn-inl.h b/src/operator/cudnn_lrn-inl.h index 97bfde3d4134..d1f440fa5439 100644 --- a/src/operator/cudnn_lrn-inl.h +++ b/src/operator/cudnn_lrn-inl.h @@ -36,7 +36,7 @@ class CuDNNLocalResponseNormOp : public Operator { using namespace mshadow; using namespace mshadow::expr; CHECK_EQ(in_data.size(), 1); - CHECK_EQ(out_data.size(), 1); + CHECK_EQ(out_data.size(), 2); float alpha = 1.0f; float beta = 0.0f; Stream *s = ctx.get_stream(); @@ -68,7 +68,7 @@ class CuDNNLocalResponseNormOp : public Operator { using namespace mshadow::expr; CHECK_EQ(out_grad.size(), 1); CHECK_EQ(in_data.size(), 1); - CHECK_EQ(out_data.size(), 1); + CHECK_EQ(out_data.size(), 2); CHECK_EQ(req.size(), 1); CHECK_EQ(in_grad.size(), 1); float alpha = 1.0f; @@ -100,7 +100,7 @@ class CuDNNLocalResponseNormOp : public Operator { const std::vector &out_data) { using namespace mshadow; CHECK_EQ(in_data.size(), 1); - CHECK_EQ(out_data.size(), 1); + CHECK_EQ(out_data.size(), 2); if (!init_cudnn_) { init_cudnn_ = true; Tensor data = in_data[lrn_enum::kData].get(s); diff --git a/src/operator/cudnn_pooling-inl.h b/src/operator/cudnn_pooling-inl.h index e995a1b289b0..851351e40e1e 100644 --- a/src/operator/cudnn_pooling-inl.h +++ b/src/operator/cudnn_pooling-inl.h @@ -14,13 +14,14 @@ namespace mxnet { namespace op { +template class CuDNNPoolingOp : public Operator { public: explicit CuDNNPoolingOp(PoolingParam p) { param_ = p; init_cudnn_ = false; // TODO(xxx): fp16 - dtype_ = CUDNN_DATA_FLOAT; + dtype_ = mshadow::DataType::kCudnnFlag; switch (param_.pool_type) { case pool_enum::kMaxPooling: mode_ = CUDNN_POOLING_MAX; @@ -52,12 +53,12 
@@ class CuDNNPoolingOp : public Operator { CHECK_EQ(out_data.size(), 1); Stream *s = ctx.get_stream(); CHECK_EQ(s->dnn_handle_ownership_, mshadow::Stream::OwnHandle); - float alpha = 1.0f; - float beta = 0.0f; + typename DataType::ScaleType alpha = 1.0f; + typename DataType::ScaleType beta = 0.0f; if (param_.kernel.ndim() == 2) { // 2d pool - Tensor data = in_data[pool_enum::kData].get(s); - Tensor out = out_data[pool_enum::kOut].get(s); + Tensor data = in_data[pool_enum::kData].get(s); + Tensor out = out_data[pool_enum::kOut].get(s); if (!init_cudnn_) { this->Init(s, in_data, out_data); } @@ -73,8 +74,8 @@ class CuDNNPoolingOp : public Operator { out.dptr_), CUDNN_STATUS_SUCCESS); } else if (param_.kernel.ndim() == 3) { // 3d pool - Tensor data = in_data[pool_enum::kData].get(s); - Tensor out = out_data[pool_enum::kOut].get(s); + Tensor data = in_data[pool_enum::kData].get(s); + Tensor out = out_data[pool_enum::kOut].get(s); if (!init_cudnn_) { this->Init(s, in_data, out_data); } @@ -110,14 +111,14 @@ class CuDNNPoolingOp : public Operator { Stream *s = ctx.get_stream(); CHECK_EQ(s->dnn_handle_ownership_, mshadow::Stream::OwnHandle); - float alpha = 1.0f; - float beta = 0.0f; + typename DataType::ScaleType alpha = 1.0f; + typename DataType::ScaleType beta = 0.0f; if (param_.kernel.ndim() == 2) { // 2d pool - Tensor m_out_grad = out_grad[pool_enum::kOut].get(s); - Tensor m_in_data = in_data[pool_enum::kData].get(s); - Tensor m_out_data = out_data[pool_enum::kOut].get(s); - Tensor m_in_grad = in_grad[pool_enum::kData].get(s); + Tensor m_out_grad = out_grad[pool_enum::kOut].get(s); + Tensor m_in_data = in_data[pool_enum::kData].get(s); + Tensor m_out_data = out_data[pool_enum::kOut].get(s); + Tensor m_in_grad = in_grad[pool_enum::kData].get(s); CHECK_EQ(cudnnPoolingBackward(s->dnn_handle_, pooling_desc_, &alpha, @@ -132,10 +133,10 @@ class CuDNNPoolingOp : public Operator { m_in_grad.dptr_), CUDNN_STATUS_SUCCESS); } else if (param_.kernel.ndim() == 3) { // 3d pool - 
Tensor m_out_grad = out_grad[pool_enum::kOut].get(s); - Tensor m_in_data = in_data[pool_enum::kData].get(s); - Tensor m_out_data = out_data[pool_enum::kOut].get(s); - Tensor m_in_grad = in_grad[pool_enum::kData].get(s); + Tensor m_out_grad = out_grad[pool_enum::kOut].get(s); + Tensor m_in_data = in_data[pool_enum::kData].get(s); + Tensor m_out_data = out_data[pool_enum::kOut].get(s); + Tensor m_in_grad = in_grad[pool_enum::kData].get(s); CHECK_EQ(cudnnPoolingBackward(s->dnn_handle_, pooling_desc_, &alpha, @@ -167,8 +168,8 @@ class CuDNNPoolingOp : public Operator { init_cudnn_ = true; if (param_.kernel.ndim() == 2) { // 2d conv - Tensor data = in_data[pool_enum::kData].get(s); - Tensor out = out_data[pool_enum::kOut].get(s); + Tensor data = in_data[pool_enum::kData].get(s); + Tensor out = out_data[pool_enum::kOut].get(s); mshadow::Shape<4> dshape = data.shape_; CHECK_EQ(cudnnCreatePoolingDescriptor(&pooling_desc_), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnCreateTensorDescriptor(&in_desc_), CUDNN_STATUS_SUCCESS); @@ -210,8 +211,8 @@ class CuDNNPoolingOp : public Operator { CUDNN_STATUS_SUCCESS); #endif } else { - Tensor data = in_data[pool_enum::kData].get(s); - Tensor out = out_data[pool_enum::kOut].get(s); + Tensor data = in_data[pool_enum::kData].get(s); + Tensor out = out_data[pool_enum::kOut].get(s); CHECK_EQ(cudnnCreatePoolingDescriptor(&pooling_desc_), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnCreateTensorDescriptor(&in_desc_), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnCreateTensorDescriptor(&out_desc_), CUDNN_STATUS_SUCCESS); diff --git a/src/operator/cudnn_rnn-inl.h b/src/operator/cudnn_rnn-inl.h index 5707846a781f..f33e68a31d79 100644 --- a/src/operator/cudnn_rnn-inl.h +++ b/src/operator/cudnn_rnn-inl.h @@ -7,6 +7,7 @@ #ifndef MXNET_OPERATOR_CUDNN_RNN_INL_H_ #define MXNET_OPERATOR_CUDNN_RNN_INL_H_ +#include #include #include #include @@ -72,7 +73,8 @@ class CuDNNRNNOp : public Operator { CHECK_EQ(cudnnDestroyFilterDescriptor(w_desc_), CUDNN_STATUS_SUCCESS); 
CHECK_EQ(cudnnDestroyRNNDescriptor(rnn_desc_), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnDestroyDropoutDescriptor(dropout_desc_), CUDNN_STATUS_SUCCESS); - CHECK_EQ(cudaFree(dropout_states_), CUDNN_STATUS_SUCCESS); + Storage::Get()->Free(dropout_states_); + Storage::Get()->Free(reserve_space_); } } @@ -102,10 +104,11 @@ class CuDNNRNNOp : public Operator { DType * cx_ptr = NULL; DType * cy_ptr = NULL; - if (param_.mode == rnn_enum::kLstm) { + + if (param_.lstm_q_) cx_ptr = (in_data[rnn_enum::kStateCell].get(s)).dptr_; + if (param_.lstm_q_ && param_.state_outputs) cy_ptr = (out_data[rnn_enum::kStateCellOut].get(s)).dptr_; - } CHECK_EQ(x.CheckContiguous(), true); CHECK_EQ(w.CheckContiguous(), true); @@ -117,7 +120,6 @@ class CuDNNRNNOp : public Operator { } // Get temp space int temp_size = workspace_size_; - temp_size += ctx.is_train ? reserve_space_size_ : 0; Tensor temp_space = ctx.requested[rnn_enum::kTempSpace].get_space_typed( mshadow::Shape1(temp_size), s); @@ -141,7 +143,7 @@ class CuDNNRNNOp : public Operator { cy_ptr, temp_space.dptr_, workspace_byte_, - temp_space.dptr_ + workspace_size_, + reserve_space_.dptr, reserve_space_byte_), CUDNN_STATUS_SUCCESS); } else { // inference mode @@ -184,7 +186,9 @@ class CuDNNRNNOp : public Operator { CHECK_EQ(out_data.size(), out_expected); CHECK_EQ(in_grad.size(), in_expected); CHECK_EQ(out_grad.size(), out_expected); - + CHECK_EQ(req.size(), in_expected); + CHECK_NE(req[rnn_enum::kData], kAddTo) << "AddTo is not supported for data"; + CHECK_NE(req[rnn_enum::kState], kAddTo) << "AddTo is not supported for state"; Stream *s = ctx.get_stream(); // get input + output tensors Tensor x = in_data[rnn_enum::kData].get(s); @@ -195,7 +199,9 @@ class CuDNNRNNOp : public Operator { Tensor dhx = in_grad[rnn_enum::kState].get(s); Tensor y = out_data[rnn_enum::kOut].get(s); Tensor dy = out_grad[rnn_enum::kOut].get(s); - + if (req[rnn_enum::kParams] != kAddTo) { + dw = mshadow::expr::ScalarExp(0.0f); + } // only need kStateOut grad 
output_states is true void * dhy_ptr = NULL; if (param_.state_outputs) @@ -207,6 +213,7 @@ class CuDNNRNNOp : public Operator { void * cx_ptr = NULL; if (param_.mode == rnn_enum::kLstm) { + CHECK_NE(req[rnn_enum::kStateCell], kAddTo) << "AddTo is not supported for state cell"; cx_ptr = (in_data[rnn_enum::kStateCell].get(s)).dptr_; dcx_ptr = (in_grad[rnn_enum::kStateCell].get(s)).dptr_; } @@ -215,8 +222,11 @@ class CuDNNRNNOp : public Operator { CHECK_EQ(x.CheckContiguous(), true); CHECK_EQ(w.CheckContiguous(), true); + CHECK_EQ(dw.CheckContiguous(), true); CHECK_EQ(hx.CheckContiguous(), true); + CHECK_EQ(dhx.CheckContiguous(), true); CHECK_EQ(y.CheckContiguous(), true); + CHECK_EQ(dy.CheckContiguous(), true); if (!init_cudnn_) { Init(s, in_data, out_data); @@ -224,7 +234,6 @@ class CuDNNRNNOp : public Operator { // Get temp space int temp_size = workspace_size_; - temp_size += ctx.is_train ? reserve_space_size_ : 0; Tensor temp_space = ctx.requested[rnn_enum::kTempSpace].get_space_typed( mshadow::Shape1(temp_size), s); @@ -253,7 +262,7 @@ class CuDNNRNNOp : public Operator { dcx_ptr, temp_space.dptr_, workspace_byte_, - temp_space.dptr_ + workspace_size_, + reserve_space_.dptr, reserve_space_byte_), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnRNNBackwardWeights(s->dnn_handle_, rnn_desc_, @@ -268,7 +277,7 @@ class CuDNNRNNOp : public Operator { workspace_byte_, dw_desc_, dw.dptr_, - temp_space.dptr_ + workspace_size_, + reserve_space_.dptr, reserve_space_byte_), CUDNN_STATUS_SUCCESS); } @@ -414,11 +423,11 @@ class CuDNNRNNOp : public Operator { CHECK_EQ(cudnnDropoutGetStatesSize(s->dnn_handle_, &dropout_byte_), CUDNN_STATUS_SUCCESS); dropout_size_ = dropout_byte_ / sizeof(DType); - CHECK_EQ(cudaMalloc(&dropout_states_, dropout_byte_), CUDNN_STATUS_SUCCESS); + dropout_states_ = Storage::Get()->Alloc(dropout_byte_, Context::GPU()); CHECK_EQ(cudnnSetDropoutDescriptor(dropout_desc_, s->dnn_handle_, param_.p, // keep probability - dropout_states_, + dropout_states_.dptr, 
dropout_byte_, seed_), CUDNN_STATUS_SUCCESS); // RNN descriptors @@ -443,9 +452,10 @@ class CuDNNRNNOp : public Operator { x_desc_vec_.data(), &reserve_space_byte_), CUDNN_STATUS_SUCCESS); workspace_size_ = workspace_byte_ / sizeof(DType); - reserve_space_size_ = reserve_space_byte_ / sizeof(DType); + // Allocate the reserve space + reserve_space_ = Storage::Get()->Alloc(reserve_space_byte_, Context::GPU()); - // check that number of params are correct + // Check that number of params are correct size_t cudnn_param_size; CHECK_EQ(cudnnGetRNNParamsSize(s->dnn_handle_, rnn_desc_, @@ -479,11 +489,10 @@ class CuDNNRNNOp : public Operator { cudnnDirectionMode_t direction_; cudnnRNNInputMode_t input_mode_; cudnnDropoutDescriptor_t dropout_desc_; - void *dropout_states_; + Storage::Handle dropout_states_, reserve_space_; uint64_t seed_ = 1337ull; size_t workspace_byte_, reserve_space_byte_, dropout_byte_; - int workspace_size_, reserve_space_size_, dropout_size_; - + int workspace_size_, dropout_size_; std::vector x_desc_vec_, y_desc_vec_, dx_desc_vec_, dy_desc_vec_; cudnnTensorDescriptor_t hx_desc_, cx_desc_; cudnnTensorDescriptor_t hy_desc_, cy_desc_; diff --git a/src/operator/dropout-inl.h b/src/operator/dropout-inl.h index 229001eaa66b..20e3b1bfa3b3 100644 --- a/src/operator/dropout-inl.h +++ b/src/operator/dropout-inl.h @@ -35,7 +35,7 @@ struct DropoutParam : public dmlc::Parameter { } }; // struct DropoutParam -template +template class DropoutOp : public Operator { public: explicit DropoutOp(DropoutParam param) { @@ -54,12 +54,13 @@ class DropoutOp : public Operator { CHECK_EQ(out_data.size(), 2); } Stream *s = ctx.get_stream(); - Tensor data = in_data[dropout::kData].FlatTo2D(s); - Tensor out = out_data[dropout::kOut].FlatTo2D(s); + Tensor data = in_data[dropout::kData].FlatTo2D(s); + Tensor out = out_data[dropout::kOut].FlatTo2D(s); if (ctx.is_train) { - Tensor mask = out_data[dropout::kMask].FlatTo2D(s); + Tensor mask = out_data[dropout::kMask].FlatTo2D(s); 
Random *prnd = ctx.requested[dropout::kRandom].get_random(s); - mask = F(prnd->uniform(mask.shape_), pkeep_) * (1.0f / pkeep_); + mask = tcast(F( + prnd->uniform(mask.shape_), pkeep_) * (1.0f / pkeep_)); Assign(out, req[dropout::kOut], data * mask); } else { Assign(out, req[dropout::kOut], F(data)); @@ -78,9 +79,9 @@ class DropoutOp : public Operator { CHECK_EQ(out_grad.size(), 1); CHECK_EQ(in_grad.size(), 1); Stream *s = ctx.get_stream(); - Tensor grad = out_grad[dropout::kOut].FlatTo2D(s); - Tensor mask = out_data[dropout::kMask].FlatTo2D(s); - Tensor gdata = in_grad[dropout::kData].FlatTo2D(s); + Tensor grad = out_grad[dropout::kOut].FlatTo2D(s); + Tensor mask = out_data[dropout::kMask].FlatTo2D(s); + Tensor gdata = in_grad[dropout::kData].FlatTo2D(s); Assign(gdata, req[dropout::kData], grad * mask); } @@ -90,7 +91,7 @@ class DropoutOp : public Operator { template -Operator *CreateOp(DropoutParam param); +Operator *CreateOp(DropoutParam param, int dtype); #if DMLC_USE_CXX11 class DropoutProp : public OperatorProperty { @@ -116,6 +117,23 @@ class DropoutProp : public OperatorProperty { return true; } + bool InferType(std::vector *in_type, + std::vector *out_type, + std::vector *aux_type) const override { + CHECK_EQ(in_type->size(), 1); + int dtype = in_type->at(0); + + if (dtype == -1) { + LOG(FATAL) << "input type to dropout is not specified."; + return false; + } + + size_t nout = this->ListOutputs().size(); + out_type->clear(); + for (size_t i = 0; i < nout; ++i) out_type->push_back(dtype); + return true; + } + OperatorProperty* Copy() const override { auto ptr = new DropoutProp(); ptr->param_ = param_; @@ -164,7 +182,13 @@ class DropoutProp : public OperatorProperty { return {"output", "mask"}; } - Operator* CreateOperator(Context ctx) const override; + Operator* CreateOperator(Context ctx) const override { + LOG(FATAL) << "Not Implemented"; + return NULL; + } + + Operator* CreateOperatorEx(Context ctx, std::vector *in_shape, + std::vector *in_type) const 
override; private: DropoutParam param_; diff --git a/src/operator/dropout.cc b/src/operator/dropout.cc index 506d7ef544b7..9584ae300fb3 100644 --- a/src/operator/dropout.cc +++ b/src/operator/dropout.cc @@ -10,13 +10,22 @@ namespace mxnet { namespace op { template<> -Operator *CreateOp(DropoutParam param) { - return new DropoutOp(param); +Operator *CreateOp(DropoutParam param, int dtype) { + Operator *op = NULL; + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + op = new DropoutOp(param); + }); + return op; } // DO_BIND_DISPATCH comes from operator_common.h -Operator *DropoutProp::CreateOperator(Context ctx) const { - DO_BIND_DISPATCH(CreateOp, param_); +Operator *DropoutProp::CreateOperatorEx(Context ctx, std::vector *in_shape, + std::vector *in_type) const { + std::vector out_shape, aux_shape; + std::vector out_type, aux_type; + CHECK(InferType(in_type, &out_type, &aux_type)); + CHECK(InferShape(in_shape, &out_shape, &aux_shape)); + DO_BIND_DISPATCH(CreateOp, param_, in_type->at(0)); } DMLC_REGISTER_PARAMETER(DropoutParam); diff --git a/src/operator/dropout.cu b/src/operator/dropout.cu index f0c1da8dbd95..ea9eb7dfa200 100644 --- a/src/operator/dropout.cu +++ b/src/operator/dropout.cu @@ -10,8 +10,12 @@ namespace mxnet { namespace op { template<> -Operator *CreateOp(DropoutParam param) { - return new DropoutOp(param); +Operator *CreateOp(DropoutParam param, int dtype) { + Operator *op = NULL; + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + op = new DropoutOp(param); + }); + return op; } } // namespace op } // namespace mxnet diff --git a/src/operator/elementwise_binary_broadcast_op-inl.h b/src/operator/elementwise_binary_broadcast_op-inl.h index 0723657cc3ef..be6176930cb4 100644 --- a/src/operator/elementwise_binary_broadcast_op-inl.h +++ b/src/operator/elementwise_binary_broadcast_op-inl.h @@ -207,7 +207,7 @@ void BinaryBroadcastBackward_(const OutputGrad& out_grad, << "Binary function only support ingrad/outgrad with the same type"; CHECK_EQ(out_grad.data.type_flag_, 
rhs_grad->type_flag_) << "Binary function only support ingrad/outgrad with the same type"; - CHECK_EQ(rhs_grad->shape_.ndim(), rhs_grad->shape_.ndim()) << + CHECK_EQ(lhs_grad->shape_.ndim(), rhs_grad->shape_.ndim()) << "the ndim of lhs_grad and rhs_grad must be equal," " shape of lhs_grad=" << lhs_grad->shape_ << " shape of rhs_grad=" << rhs_grad->shape_; if (!IsBroadcastNeeded_(lhs_grad->shape_, rhs_grad->shape_)) { diff --git a/src/operator/elementwise_binary_op-inl.h b/src/operator/elementwise_binary_op-inl.h index bb3bc66e1ecc..a8290500befe 100644 --- a/src/operator/elementwise_binary_op-inl.h +++ b/src/operator/elementwise_binary_op-inl.h @@ -32,10 +32,10 @@ void BinaryForward_(const TBlob& lhs, CHECK_EQ(ret->type_flag_, rhs.type_flag_) << "Binary function only support input/output with the same type"; MSHADOW_TYPE_SWITCH(ret->type_flag_, DType, { - mshadow::Tensor out = ret->FlatTo2D(s); + mshadow::Tensor out = ret->FlatTo1D(s); ASSIGN_DISPATCH(out, req, - F(lhs.FlatTo2D(s), - rhs.FlatTo2D(s))); + F(lhs.FlatTo1D(s), + rhs.FlatTo1D(s))); }); } @@ -51,9 +51,9 @@ void PlusBackward_(const OutputGrad& out_grad, using namespace mshadow::expr; mshadow::Stream *s = ctx.get_stream(); MSHADOW_TYPE_SWITCH(lhs_grad->type_flag_, DType, { - mshadow::Tensor mout_grad = out_grad.data.FlatTo2D(s); - mshadow::Tensor mlhs_grad = lhs_grad->FlatTo2D(s); - mshadow::Tensor mrhs_grad = rhs_grad->FlatTo2D(s); + mshadow::Tensor mout_grad = out_grad.data.FlatTo1D(s); + mshadow::Tensor mlhs_grad = lhs_grad->FlatTo1D(s); + mshadow::Tensor mrhs_grad = rhs_grad->FlatTo1D(s); ASSIGN_DISPATCH(mlhs_grad, req_lhs_grad, F(mout_grad)); ASSIGN_DISPATCH(mrhs_grad, req_rhs_grad, F(mout_grad)); }); @@ -70,9 +70,9 @@ void MinusBackward_(const OutputGrad& out_grad, using namespace mshadow::expr; mshadow::Stream *s = ctx.get_stream(); MSHADOW_TYPE_SWITCH(lhs_grad->type_flag_, DType, { - mshadow::Tensor mout_grad = out_grad.data.FlatTo2D(s); - mshadow::Tensor mlhs_grad = lhs_grad->FlatTo2D(s); - 
mshadow::Tensor mrhs_grad = rhs_grad->FlatTo2D(s); + mshadow::Tensor mout_grad = out_grad.data.FlatTo1D(s); + mshadow::Tensor mlhs_grad = lhs_grad->FlatTo1D(s); + mshadow::Tensor mrhs_grad = rhs_grad->FlatTo1D(s); ASSIGN_DISPATCH(mlhs_grad, req_lhs_grad, F(mout_grad)); ASSIGN_DISPATCH(mrhs_grad, req_rhs_grad, F(mout_grad)); }); @@ -91,11 +91,11 @@ void MulBackward_(const OutputGrad& out_grad, using namespace mshadow::expr; mshadow::Stream *s = ctx.get_stream(); MSHADOW_TYPE_SWITCH(lhs_grad->type_flag_, DType, { - mshadow::Tensor mout_grad = out_grad.data.FlatTo2D(s); - mshadow::Tensor mlhs_data = lhs.data.FlatTo2D(s); - mshadow::Tensor mrhs_data = rhs.data.FlatTo2D(s); - mshadow::Tensor mlhs_grad = lhs_grad->FlatTo2D(s); - mshadow::Tensor mrhs_grad = rhs_grad->FlatTo2D(s); + mshadow::Tensor mout_grad = out_grad.data.FlatTo1D(s); + mshadow::Tensor mlhs_data = lhs.data.FlatTo1D(s); + mshadow::Tensor mrhs_data = rhs.data.FlatTo1D(s); + mshadow::Tensor mlhs_grad = lhs_grad->FlatTo1D(s); + mshadow::Tensor mrhs_grad = rhs_grad->FlatTo1D(s); CHECK_NE(req_rhs_grad, kWriteInplace); ASSIGN_DISPATCH(mrhs_grad, req_rhs_grad, mlhs_data * mout_grad); ASSIGN_DISPATCH(mlhs_grad, req_lhs_grad, mrhs_data * mout_grad); @@ -115,11 +115,11 @@ void DivBackward_(const OutputGrad& out_grad, using namespace mshadow::expr; mshadow::Stream *s = ctx.get_stream(); MSHADOW_TYPE_SWITCH(lhs_grad->type_flag_, DType, { - mshadow::Tensor mout_grad = out_grad.data.FlatTo2D(s); - mshadow::Tensor mlhs_data = lhs.data.FlatTo2D(s); - mshadow::Tensor mrhs_data = rhs.data.FlatTo2D(s); - mshadow::Tensor mlhs_grad = lhs_grad->FlatTo2D(s); - mshadow::Tensor mrhs_grad = rhs_grad->FlatTo2D(s); + mshadow::Tensor mout_grad = out_grad.data.FlatTo1D(s); + mshadow::Tensor mlhs_data = lhs.data.FlatTo1D(s); + mshadow::Tensor mrhs_data = rhs.data.FlatTo1D(s); + mshadow::Tensor mlhs_grad = lhs_grad->FlatTo1D(s); + mshadow::Tensor mrhs_grad = rhs_grad->FlatTo1D(s); CHECK_NE(req_rhs_grad, kWriteInplace); 
ASSIGN_DISPATCH(mrhs_grad, req_rhs_grad, F(mout_grad * mlhs_data)/ @@ -141,11 +141,11 @@ void PowerBackward_(const OutputGrad& out_grad, using namespace mshadow::expr; mshadow::Stream *s = ctx.get_stream(); MSHADOW_TYPE_SWITCH(lhs_grad->type_flag_, DType, { - mshadow::Tensor mout_grad = out_grad.data.FlatTo2D(s); - mshadow::Tensor mlhs_data = lhs.data.FlatTo2D(s); - mshadow::Tensor mrhs_data = rhs.data.FlatTo2D(s); - mshadow::Tensor mlhs_grad = lhs_grad->FlatTo2D(s); - mshadow::Tensor mrhs_grad = rhs_grad->FlatTo2D(s); + mshadow::Tensor mout_grad = out_grad.data.FlatTo1D(s); + mshadow::Tensor mlhs_data = lhs.data.FlatTo1D(s); + mshadow::Tensor mrhs_data = rhs.data.FlatTo1D(s); + mshadow::Tensor mlhs_grad = lhs_grad->FlatTo1D(s); + mshadow::Tensor mrhs_grad = rhs_grad->FlatTo1D(s); CHECK_NE(req_rhs_grad, kWriteInplace); ASSIGN_DISPATCH(mrhs_grad, req_rhs_grad, F(mlhs_data) * @@ -170,11 +170,11 @@ void MaximumBackward_(const OutputGrad& out_grad, using namespace mshadow::expr; mshadow::Stream *s = ctx.get_stream(); MSHADOW_TYPE_SWITCH(lhs_grad->type_flag_, DType, { - mshadow::Tensor mout_grad = out_grad.data.FlatTo2D(s); - mshadow::Tensor mlhs_data = lhs.data.FlatTo2D(s); - mshadow::Tensor mrhs_data = rhs.data.FlatTo2D(s); - mshadow::Tensor mlhs_grad = lhs_grad->FlatTo2D(s); - mshadow::Tensor mrhs_grad = rhs_grad->FlatTo2D(s); + mshadow::Tensor mout_grad = out_grad.data.FlatTo1D(s); + mshadow::Tensor mlhs_data = lhs.data.FlatTo1D(s); + mshadow::Tensor mrhs_data = rhs.data.FlatTo1D(s); + mshadow::Tensor mlhs_grad = lhs_grad->FlatTo1D(s); + mshadow::Tensor mrhs_grad = rhs_grad->FlatTo1D(s); CHECK_NE(req_rhs_grad, kWriteInplace); ASSIGN_DISPATCH(mrhs_grad, req_rhs_grad, mout_grad * F(mrhs_data, mlhs_data)); @@ -196,11 +196,11 @@ void MinimumBackward_(const OutputGrad& out_grad, using namespace mshadow::expr; mshadow::Stream *s = ctx.get_stream(); MSHADOW_TYPE_SWITCH(lhs_grad->type_flag_, DType, { - mshadow::Tensor mout_grad = out_grad.data.FlatTo2D(s); - mshadow::Tensor 
mlhs_data = lhs.data.FlatTo2D(s); - mshadow::Tensor mrhs_data = rhs.data.FlatTo2D(s); - mshadow::Tensor mlhs_grad = lhs_grad->FlatTo2D(s); - mshadow::Tensor mrhs_grad = rhs_grad->FlatTo2D(s); + mshadow::Tensor mout_grad = out_grad.data.FlatTo1D(s); + mshadow::Tensor mlhs_data = lhs.data.FlatTo1D(s); + mshadow::Tensor mrhs_data = rhs.data.FlatTo1D(s); + mshadow::Tensor mlhs_grad = lhs_grad->FlatTo1D(s); + mshadow::Tensor mrhs_grad = rhs_grad->FlatTo1D(s); CHECK_NE(req_rhs_grad, kWriteInplace); ASSIGN_DISPATCH(mrhs_grad, req_rhs_grad, mout_grad * F(mrhs_data, mlhs_data)); diff --git a/src/operator/elementwise_binary_scalar_op-inl.h b/src/operator/elementwise_binary_scalar_op-inl.h index a992f30e7625..1b4c0f34db7a 100644 --- a/src/operator/elementwise_binary_scalar_op-inl.h +++ b/src/operator/elementwise_binary_scalar_op-inl.h @@ -29,9 +29,9 @@ void BinaryScalarLForward_(const TBlob& lhs, CHECK_EQ(ret->type_flag_, lhs.type_flag_) << "Binary function only support input/output with the same type"; MSHADOW_TYPE_SWITCH(ret->type_flag_, DType, { - mshadow::Tensor out = ret->FlatTo2D(s); + mshadow::Tensor out = ret->FlatTo1D(s); ASSIGN_DISPATCH(out, req, - F(lhs.FlatTo2D(s), + F(lhs.FlatTo1D(s), scalar(env.scalar))); }); } @@ -47,10 +47,10 @@ void BinaryScalarRForward_(const TBlob& rhs, CHECK_EQ(ret->type_flag_, rhs.type_flag_) << "Binary function only support input/output with the same type"; MSHADOW_TYPE_SWITCH(ret->type_flag_, DType, { - mshadow::Tensor out = ret->FlatTo2D(s); + mshadow::Tensor out = ret->FlatTo1D(s); ASSIGN_DISPATCH(out, req, F(scalar(env.scalar), - rhs.FlatTo2D(s))); + rhs.FlatTo1D(s))); }); } @@ -66,9 +66,9 @@ void BinaryScalarBackwardT0_(const OutputGrad& out_grad, CHECK_EQ(in_grad->type_flag_, out_grad.data.type_flag_) << "Unary function only support input/output with the same type"; MSHADOW_TYPE_SWITCH(in_grad->type_flag_, DType, { - mshadow::Tensor igrad = in_grad->FlatTo2D(s); + mshadow::Tensor igrad = in_grad->FlatTo1D(s); ASSIGN_DISPATCH(igrad, 
req, - F(out_grad.data.FlatTo2D())); + F(out_grad.data.FlatTo1D())); }); } @@ -84,9 +84,9 @@ void BinaryScalarBackwardT1_(const OutputGrad& out_grad, CHECK_EQ(in_grad->type_flag_, out_grad.data.type_flag_) << "Unary function only support input/output with the same type"; MSHADOW_TYPE_SWITCH(in_grad->type_flag_, DType, { - mshadow::Tensor igrad = in_grad->FlatTo2D(s); + mshadow::Tensor igrad = in_grad->FlatTo1D(s); ASSIGN_DISPATCH(igrad, req, - F(out_grad.data.FlatTo2D(), + F(out_grad.data.FlatTo1D(), scalar(env.scalar))); }); } @@ -104,11 +104,11 @@ void BinaryScalarBackwardT2_(const OutputGrad& out_grad, CHECK_EQ(in_grad->type_flag_, out_grad.data.type_flag_) << "Unary function only support input/output with the same type"; MSHADOW_TYPE_SWITCH(in_grad->type_flag_, DType, { - mshadow::Tensor igrad = in_grad->FlatTo2D(s); + mshadow::Tensor igrad = in_grad->FlatTo1D(s); ASSIGN_DISPATCH(igrad, req, - (F(lhs.data.FlatTo2D(), + (F(lhs.data.FlatTo1D(), scalar(env.scalar)) * - out_grad.data.FlatTo2D())); + out_grad.data.FlatTo1D())); }); } @@ -125,11 +125,11 @@ void DivRBackward_(const OutputGrad& out_grad, CHECK_EQ(in_grad->type_flag_, out_grad.data.type_flag_) << "Unary function only support input/output with the same type"; MSHADOW_TYPE_SWITCH(in_grad->type_flag_, DType, { - mshadow::Tensor igrad = in_grad->FlatTo2D(s); + mshadow::Tensor igrad = in_grad->FlatTo1D(s); ASSIGN_DISPATCH(igrad, req, (scalar(-env.scalar) / - F(in_data.data.FlatTo2D()) * - out_grad.data.FlatTo2D())); + F(in_data.data.FlatTo1D()) * + out_grad.data.FlatTo1D())); }); } @@ -147,12 +147,12 @@ void PowerLBackward_(const OutputGrad& out_grad, CHECK_EQ(in_grad->type_flag_, out_grad.data.type_flag_) << "Unary function only support input/output with the same type"; MSHADOW_TYPE_SWITCH(in_grad->type_flag_, DType, { - mshadow::Tensor igrad = in_grad->FlatTo2D(s); + mshadow::Tensor igrad = in_grad->FlatTo1D(s); ASSIGN_DISPATCH(igrad, req, - (F(lhs.data.FlatTo2D(), + (F(lhs.data.FlatTo1D(), 
scalar(env.scalar - 1.0f)) * scalar(env.scalar) * - out_grad.data.FlatTo2D())); + out_grad.data.FlatTo1D())); }); } @@ -169,11 +169,11 @@ void PowerRBackward_(const OutputGrad& out_grad, CHECK_EQ(in_grad->type_flag_, out_grad.data.type_flag_) << "Unary function only support input/output with the same type"; MSHADOW_TYPE_SWITCH(in_grad->type_flag_, DType, { - mshadow::Tensor igrad = in_grad->FlatTo2D(s); + mshadow::Tensor igrad = in_grad->FlatTo1D(s); ASSIGN_DISPATCH(igrad, req, (scalar(logf(env.scalar)) * - out_data.data.FlatTo2D() * - out_grad.data.FlatTo2D())); + out_data.data.FlatTo1D() * + out_grad.data.FlatTo1D())); }); } diff --git a/src/operator/elementwise_sum-inl.h b/src/operator/elementwise_sum-inl.h index ebf33f90cc1c..54a6edae7a73 100644 --- a/src/operator/elementwise_sum-inl.h +++ b/src/operator/elementwise_sum-inl.h @@ -52,34 +52,34 @@ class ElementWiseSumOp : public Operator { if (req[elemsum::kOut] == kNullOp) return; Stream *s = ctx.get_stream(); - Tensor out = out_data[elemsum::kOut].FlatTo2D(s); + Tensor out = out_data[elemsum::kOut].FlatTo1D(s); switch (size_) { case 2: { - Tensor in_0 = in_data[elemsum::kData0].FlatTo2D(s); - Tensor in_1 = in_data[elemsum::kData1].FlatTo2D(s); + Tensor in_0 = in_data[elemsum::kData0].FlatTo1D(s); + Tensor in_1 = in_data[elemsum::kData1].FlatTo1D(s); Assign(out, req[elemsum::kOut], in_0 + in_1); break; } case 3: { - Tensor in_0 = in_data[elemsum::kData0].FlatTo2D(s); - Tensor in_1 = in_data[elemsum::kData1].FlatTo2D(s); - Tensor in_2 = in_data[elemsum::kData2].FlatTo2D(s); + Tensor in_0 = in_data[elemsum::kData0].FlatTo1D(s); + Tensor in_1 = in_data[elemsum::kData1].FlatTo1D(s); + Tensor in_2 = in_data[elemsum::kData2].FlatTo1D(s); Assign(out, req[elemsum::kOut], in_0 + in_1 + in_2); break; } case 4: { - Tensor in_0 = in_data[elemsum::kData0].FlatTo2D(s); - Tensor in_1 = in_data[elemsum::kData1].FlatTo2D(s); - Tensor in_2 = in_data[elemsum::kData2].FlatTo2D(s); - Tensor in_3 = 
in_data[elemsum::kData3].FlatTo2D(s); + Tensor in_0 = in_data[elemsum::kData0].FlatTo1D(s); + Tensor in_1 = in_data[elemsum::kData1].FlatTo1D(s); + Tensor in_2 = in_data[elemsum::kData2].FlatTo1D(s); + Tensor in_3 = in_data[elemsum::kData3].FlatTo1D(s); Assign(out, req[elemsum::kOut], in_0 + in_1 + in_2 + in_3); break; } default: { - Tensor in_0 = in_data[elemsum::kData0].FlatTo2D(s); + Tensor in_0 = in_data[elemsum::kData0].FlatTo1D(s); Assign(out, req[elemsum::kOut], F(in_0)); for (int i = 1; i < size_; ++i) { - out += in_data[i].FlatTo2D(s); + out += in_data[i].FlatTo1D(s); } break; } @@ -97,10 +97,10 @@ class ElementWiseSumOp : public Operator { using namespace mshadow::expr; CHECK_EQ(in_grad.size(), static_cast(size_)); Stream *s = ctx.get_stream(); - Tensor ograd = out_grad[elemsum::kOut].FlatTo2D(s); + Tensor ograd = out_grad[elemsum::kOut].FlatTo1D(s); for (int i = 0; i < size_; ++i) { if (req[i] == kNullOp || req[i] == kWriteInplace) continue; - Tensor igrad = in_grad[i].FlatTo2D(s); + Tensor igrad = in_grad[i].FlatTo1D(s); Assign(igrad, req[i], F(ograd)); } } diff --git a/src/operator/elementwise_unary_op-inl.h b/src/operator/elementwise_unary_op-inl.h index 97bd276229e4..c96dfc570660 100644 --- a/src/operator/elementwise_unary_op-inl.h +++ b/src/operator/elementwise_unary_op-inl.h @@ -30,8 +30,27 @@ void UnaryForward_(const TBlob& src, CHECK_EQ(ret->type_flag_, src.type_flag_) << "Unary function only support input/output with the same type"; MSHADOW_TYPE_SWITCH(ret->type_flag_, DType, { - mshadow::Tensor out = ret->FlatTo2D(s); - ASSIGN_DISPATCH(out, req, F(src.FlatTo2D(s))); + mshadow::Tensor out = ret->FlatTo1D(s); + ASSIGN_DISPATCH(out, req, F(src.FlatTo1D(s))); + }); +} + +// backward function that takes input value of the op +template +void UnaryBackward_(const OutputGrad& out_grad, + const EnvArguments& env, + TBlob *in_grad, + OpReqType req, + RunContext ctx) { + using namespace mxnet::op; + using namespace mshadow::expr; + mshadow::Stream *s = 
ctx.get_stream(); + CHECK_EQ(in_grad->type_flag_, out_grad.data.type_flag_) + << "Unary function only support input/output with the same type"; + MSHADOW_TYPE_SWITCH(in_grad->type_flag_, DType, { + mshadow::Tensor igrad = in_grad->FlatTo1D(s); + ASSIGN_DISPATCH(igrad, req, + (F(out_grad.data.FlatTo1D(s)))); }); } @@ -51,10 +70,10 @@ void UnaryBackwardUseIn_(const OutputGrad& out_grad, CHECK_EQ(in_grad->type_flag_, in_data0.data.type_flag_) << "Unary function only support input/output with the same type"; MSHADOW_TYPE_SWITCH(in_grad->type_flag_, DType, { - mshadow::Tensor igrad = in_grad->FlatTo2D(s); + mshadow::Tensor igrad = in_grad->FlatTo1D(s); ASSIGN_DISPATCH(igrad, req, - (F(in_data0.data.FlatTo2D(s)) * - out_grad.data.FlatTo2D(s))); + (F(in_data0.data.FlatTo1D(s)) * + out_grad.data.FlatTo1D(s))); }); } @@ -74,10 +93,10 @@ void UnaryBackwardUseOut_(const OutputGrad& out_grad, CHECK_EQ(in_grad->type_flag_, out_value.data.type_flag_) << "Unary function only support input/output with the same type"; MSHADOW_TYPE_SWITCH(in_grad->type_flag_, DType, { - mshadow::Tensor igrad = in_grad->FlatTo2D(s); + mshadow::Tensor igrad = in_grad->FlatTo1D(s); ASSIGN_DISPATCH(igrad, req, - (F(out_value.data.FlatTo2D(s)) * - out_grad.data.FlatTo2D(s))); + (F(out_value.data.FlatTo1D(s)) * + out_grad.data.FlatTo1D(s))); }); } @@ -138,6 +157,11 @@ MXNET_REGISTER_SIMPLE_OP(sin, XPU) .set_function(XPU::kDevMask, UnaryForward_, kInplaceInOut) .set_gradient(XPU::kDevMask, UnaryBackwardUseIn_, kInplaceOutIn) .describe("Take sin of the src"); +// negation +MXNET_REGISTER_SIMPLE_OP(negative, XPU) +.set_function(XPU::kDevMask, UnaryForward_, kInplaceInOut) +.set_gradient(XPU::kDevMask, UnaryBackward_, kInplaceOutIn) +.describe("Take negation of the src"); } // namespace op } // namespace mxnet diff --git a/src/operator/leaky_relu-inl.h b/src/operator/leaky_relu-inl.h index 656d7e6d7dca..7d270ff1a497 100644 --- a/src/operator/leaky_relu-inl.h +++ b/src/operator/leaky_relu-inl.h @@ -177,7 
+177,7 @@ class LeakyReLUOp : public Operator { weight = in_data[leakyrelu::kGamma].get(s); grad_weight = in_grad[leakyrelu::kGamma].get(s); grad_weight = sumall_except_dim<1>(F(data) * grad); - gdata = F(output, broadcast<1>(weight, data.shape_)) * grad; + gdata = F(data, broadcast<1>(weight, data.shape_)) * grad; break; } case leakyrelu::kRReLU: { diff --git a/src/operator/lrn-inl.h b/src/operator/lrn-inl.h index 40985e7b5112..3542d9e0a3ca 100644 --- a/src/operator/lrn-inl.h +++ b/src/operator/lrn-inl.h @@ -120,9 +120,7 @@ class LocalResponseNormProp : public OperatorProperty { if (dshape.ndim() == 0) return false; out_shape->clear(); out_shape->push_back(dshape); -#if MXNET_USE_CUDNN != 1 out_shape->push_back(dshape); -#endif return true; } @@ -140,23 +138,10 @@ class LocalResponseNormProp : public OperatorProperty { const std::vector &out_grad, const std::vector &in_data, const std::vector &out_data) const override { -#if MXNET_USE_CUDNN == 1 - return {out_grad[lrn_enum::kOut], in_data[lrn_enum::kData], out_data[lrn_enum::kOut]}; -#else - return {out_grad[lrn_enum::kOut], in_data[lrn_enum::kData], out_data[lrn_enum::kTmpNorm]}; -#endif - } - - std::vector > BackwardInplaceOption( - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &in_grad) const override { -#if MXNET_USE_CUDNN == 1 - return {}; -#else - return {{out_grad[lrn_enum::kOut], in_grad[lrn_enum::kData]}}; -#endif + return { + out_grad[lrn_enum::kOut], in_data[lrn_enum::kData], + out_data[lrn_enum::kTmpNorm], out_data[lrn_enum::kOut] + }; } int NumVisibleOutputs() const override { @@ -164,7 +149,7 @@ class LocalResponseNormProp : public OperatorProperty { } int NumOutputs() const override { - return MXNET_USE_CUDNN == 1 ? 
1 : 2; + return 2; } std::vector ListArguments() const override { @@ -172,11 +157,7 @@ class LocalResponseNormProp : public OperatorProperty { } std::vector ListOutputs() const override { -#if MXNET_USE_CUDNN == 1 - return {"output"}; -#else return {"output", "tmp_norm"}; -#endif } Operator* CreateOperator(Context ctx) const override; diff --git a/src/operator/mshadow_op.h b/src/operator/mshadow_op.h index df1f851fe238..e754300f0f2f 100644 --- a/src/operator/mshadow_op.h +++ b/src/operator/mshadow_op.h @@ -198,7 +198,7 @@ struct abs { } }; -/*! \brief used for generate element of power */ +/*! \brief used for generate element of sign */ struct sign { template MSHADOW_XINLINE static DType Map(DType a) { diff --git a/src/operator/pooling-inl.h b/src/operator/pooling-inl.h index f1fda56713e9..7f9284e925e6 100644 --- a/src/operator/pooling-inl.h +++ b/src/operator/pooling-inl.h @@ -59,7 +59,7 @@ struct PoolingParam : public dmlc::Parameter { } }; -template +template class PoolingOp : public Operator { public: explicit PoolingOp(PoolingParam p) { @@ -79,8 +79,8 @@ class PoolingOp : public Operator { if (param_.kernel.ndim() == 3) { LOG(FATAL) << "Not implmented"; } - Tensor data = in_data[pool_enum::kData].get(s); - Tensor out = out_data[pool_enum::kOut].get(s); + Tensor data = in_data[pool_enum::kData].get(s); + Tensor out = out_data[pool_enum::kOut].get(s); mshadow::Shape<2> out_shape = Shape2(out.shape_[2], out.shape_[3]); if (param_.pool_type == pool_enum::kMaxPooling || param_.pool_type == pool_enum::kSumPooling) { Assign(out, @@ -94,7 +94,7 @@ class PoolingOp : public Operator { } else if (param_.pool_type == pool_enum::kAvgPooling) { Assign(out, req[pool_enum::kOut], - (1.0f / (param_.global_pool ? + scalar(1.0f / (param_.global_pool ? 
data.shape_[2] * data.shape_[3] : param_.kernel[0] * param_.kernel[1])) * \ pool(pad(data, param_.pad[0], param_.pad[1]), @@ -125,10 +125,10 @@ class PoolingOp : public Operator { LOG(FATAL) << "Not implmented"; } Stream *s = ctx.get_stream(); - Tensor grad = out_grad[pool_enum::kOut].get(s); - Tensor data = in_data[pool_enum::kData].get(s); - Tensor output_data = out_data[pool_enum::kOut].get(s); - Tensor input_grad = in_grad[pool_enum::kData].get(s); + Tensor grad = out_grad[pool_enum::kOut].get(s); + Tensor data = in_data[pool_enum::kData].get(s); + Tensor output_data = out_data[pool_enum::kOut].get(s); + Tensor input_grad = in_grad[pool_enum::kData].get(s); mshadow::Shape<2> in_shape = Shape2(data.shape_[2], data.shape_[3]); @@ -146,7 +146,9 @@ class PoolingOp : public Operator { param_.pad[1])); } else if (param_.pool_type == pool_enum::kAvgPooling) { Assign(input_grad, req[pool_enum::kData], - (1.0f / param_.kernel[0] / param_.kernel[1]) *\ + scalar(1.0f / (param_.global_pool ? + data.shape_[2] * data.shape_[3] : + param_.kernel[0] * param_.kernel[1])) * \ crop(unpool(pad(data, param_.pad[0], param_.pad[1]), pad(output_data, 0, 0), pad(grad, 0, 0), @@ -165,7 +167,7 @@ class PoolingOp : public Operator { }; // class PoolingOp template -Operator* CreateOp(PoolingParam param); +Operator* CreateOp(PoolingParam param, int dtype); #if DMLC_USE_CXX11 @@ -190,7 +192,6 @@ class PoolingProp : public OperatorProperty { if (dshape.ndim() == 0) return false; if (param_.kernel.ndim() == 2) { CHECK_EQ(dshape.ndim(), 4) << "Pooling: Input data should be 4D in (batch, channel, y, x)"; - if (param_.global_pool) { oshape[2] = 1; oshape[3] = 1; @@ -218,12 +219,29 @@ class PoolingProp : public OperatorProperty { oshape[3] = 1 + (dshape[3] + 2 * param_.pad[1] - param_.kernel[1]) / param_.stride[1]; oshape[4] = 1 + (dshape[4] + 2 * param_.pad[2] - param_.kernel[2]) / param_.stride[2]; } + out_shape->clear(); out_shape->push_back(oshape); } return true; } + bool 
InferType(std::vector *in_type, + std::vector *out_type, + std::vector *aux_type) const override { + CHECK_EQ(in_type->size(), 1); + int dtype = (*in_type)[0]; + + if (dtype == -1) { + LOG(FATAL) << "Input type to pooling is not specified."; + return false; + } + + out_type->clear(); + out_type->push_back(dtype); + return true; + } + OperatorProperty* Copy() const override { PoolingProp *prop_sym = new PoolingProp(); prop_sym->param_ = this->param_; @@ -253,7 +271,13 @@ class PoolingProp : public OperatorProperty { #endif } - Operator* CreateOperator(Context ctx) const override; + Operator* CreateOperator(Context ctx) const override { + LOG(FATAL) << "Not Implemented."; + return NULL; + } + + Operator* CreateOperatorEx(Context ctx, std::vector *in_shape, + std::vector *in_type) const override; private: PoolingParam param_; diff --git a/src/operator/pooling.cc b/src/operator/pooling.cc index 584d6f5b4c5b..a629a09ffedc 100644 --- a/src/operator/pooling.cc +++ b/src/operator/pooling.cc @@ -9,22 +9,35 @@ namespace mxnet { namespace op { template<> -Operator *CreateOp(PoolingParam param) { - switch (param.pool_type) { - case pool_enum::kMaxPooling: - return new PoolingOp(param); - case pool_enum::kAvgPooling: - return new PoolingOp(param); - case pool_enum::kSumPooling: - return new PoolingOp(param); - default: - LOG(FATAL) << "unknown pooling type"; - return NULL; - } +Operator *CreateOp(PoolingParam param, int dtype) { + Operator *op = NULL; + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + switch (param.pool_type) { + case pool_enum::kMaxPooling: + op = new PoolingOp(param); + break; + case pool_enum::kAvgPooling: + op = new PoolingOp(param); + break; + case pool_enum::kSumPooling: + op = new PoolingOp(param); + break; + default: + LOG(FATAL) << "unknown pooling type"; + return NULL; + } + }); + return op; } -Operator* PoolingProp::CreateOperator(Context ctx) const { - DO_BIND_DISPATCH(CreateOp, param_); +// DO_BIND_DISPATCH comes from operator_common.h +Operator* 
PoolingProp::CreateOperatorEx(Context ctx, std::vector *in_shape, + std::vector *in_type) const { + std::vector out_shape, aux_shape; + std::vector out_type, aux_type; + CHECK(InferType(in_type, &out_type, &aux_type)); + CHECK(InferShape(in_shape, &out_shape, &aux_shape)); + DO_BIND_DISPATCH(CreateOp, param_, (*in_type)[0]); } DMLC_REGISTER_PARAMETER(PoolingParam); diff --git a/src/operator/pooling.cu b/src/operator/pooling.cu index 72a650d69d60..ee4edbefa304 100644 --- a/src/operator/pooling.cu +++ b/src/operator/pooling.cu @@ -13,33 +13,45 @@ namespace mxnet { namespace op { template<> -Operator *CreateOp(PoolingParam param) { +Operator *CreateOp(PoolingParam param, int dtype) { + Operator *op = NULL; #if MXNET_USE_CUDNN == 1 - switch (param.pool_type) { - case pool_enum::kMaxPooling: - return new CuDNNPoolingOp(param); - case pool_enum::kAvgPooling: - return new CuDNNPoolingOp(param); - case pool_enum::kSumPooling: - LOG(WARNING) << "Sum pooling is not supported by cudnn, MxNet sum pooling is applied."; - return new PoolingOp(param); - default: - LOG(FATAL) << "unknown pooling type"; - return NULL; - } + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + switch (param.pool_type) { + case pool_enum::kMaxPooling: + op = new CuDNNPoolingOp(param); + break; + case pool_enum::kAvgPooling: + op = new CuDNNPoolingOp(param); + break; + case pool_enum::kSumPooling: + LOG(WARNING) << "Sum pooling is not supported by cudnn, MxNet sum pooling is applied."; + op = new PoolingOp(param); + break; + default: + LOG(FATAL) << "unknown pooling type"; + return NULL; + } + }); #else - switch (param.pool_type) { - case pool_enum::kMaxPooling: - return new PoolingOp(param); - case pool_enum::kAvgPooling: - return new PoolingOp(param); - case pool_enum::kSumPooling: - return new PoolingOp(param); - default: - LOG(FATAL) << "unknown pooling type"; - return NULL; - } + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + switch (param.pool_type) { + case pool_enum::kMaxPooling: + op = new 
PoolingOp(param); + break; + case pool_enum::kAvgPooling: + op = new PoolingOp(param); + break; + case pool_enum::kSumPooling: + op = new PoolingOp(param); + break; + default: + LOG(FATAL) << "unknown pooling type"; + return NULL; + } + }); #endif // MXNET_USE_CUDNN + return op; } } // namespace op diff --git a/src/operator/proposal-inl.h b/src/operator/proposal-inl.h index ed91b23b0930..23c8e39e23b7 100644 --- a/src/operator/proposal-inl.h +++ b/src/operator/proposal-inl.h @@ -27,7 +27,6 @@ namespace mxnet { namespace op { namespace proposal { -enum ProposalOpType {kTrain, kTest}; enum ProposalOpInputs {kClsProb, kBBoxPred, kImInfo}; enum ProposalOpOutputs {kOut, kScore}; enum ProposalForwardResource {kTempResource}; @@ -177,30 +176,56 @@ class ProposalOp : public Operator{ // fill in output rois for (index_t i = 0; i < out.size(0); ++i) { - index_t index = keep[i]; //batch index 0 out[i][0] = 0; - for (index_t j = 0; j < 4; ++j) { - if (i < out_size) { + if (i < out_size) { + index_t index = keep[i]; + for (index_t j = 0; j < 4; ++j) { out[i][j + 1] = workspace_ordered_proposals[index][j]; - } else { - out[i][j + 1] = 0; + } + } else { + index_t index = keep[i % out_size]; + for (index_t j = 0; j < 4; ++j) { + out[i][j + 1] = workspace_ordered_proposals[index][j]; } } } // fill in output score for (index_t i = 0; i < out_score.size(0); i++) { - index_t index = keep[i]; if (i < out_size) { + index_t index = keep[i]; out_score[i][0] = workspace_ordered_proposals[index][4]; } else { - out_score[i][0] = 0; + index_t index = keep[i % out_size]; + out_score[i][0] = workspace_ordered_proposals[index][4]; } } } + virtual void Backward(const OpContext &ctx, + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &req, + const std::vector &in_grad, + const std::vector &aux_states) { + using namespace mshadow; + using namespace mshadow::expr; + CHECK_EQ(in_grad.size(), 3); + + Stream *s = ctx.get_stream(); + Tensor 
gscores = in_grad[proposal::kClsProb].get(s); + Tensor gbbox = in_grad[proposal::kBBoxPred].get(s); + Tensor ginfo = in_grad[proposal::kImInfo].get(s); + + // can not assume the grad would be zero + Assign(gscores, req[proposal::kClsProb], 0); + Assign(gbbox, req[proposal::kBBoxPred], 0); + Assign(ginfo, req[proposal::kImInfo], 0); + } + private: ProposalParam param_; }; // class ProposalOp diff --git a/src/operator/proposal.cu b/src/operator/proposal.cu index eda844c2f28f..36f76ebfc768 100644 --- a/src/operator/proposal.cu +++ b/src/operator/proposal.cu @@ -130,7 +130,11 @@ __global__ void FilterBoxKernel(const int count, float iw = dets[index * 5 + 2] - dets[index * 5 + 0] + 1.0f; float ih = dets[index * 5 + 3] - dets[index * 5 + 1] + 1.0f; if (iw < min_size || ih < min_size) { - dets[index * 5 + 4] = 0.0f; + dets[index * 5 + 0] -= min_size / 2; + dets[index * 5 + 1] -= min_size / 2; + dets[index * 5 + 2] += min_size / 2; + dets[index * 5 + 3] += min_size / 2; + dets[index * 5 + 4] = -1.0f; } } } @@ -158,14 +162,10 @@ template __global__ void ReorderProposalsKernel(const int count, const Dtype* prev_dets, const int* order, - const int top_n, Dtype* dets) { for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < count; index += blockDim.x * gridDim.x) { - if (index > top_n) { - return; - } const int order_i = order[index]; for (int j = 0; j < 5; j ++) { dets[index * 5 + j] = prev_dets[order_i * 5 + j]; @@ -300,10 +300,11 @@ __global__ void PrepareOutput(const int count, } score[index] = dets[keep_i * 5 + 4]; } else { + int keep_i = keep[index % out_size]; for (int j = 0; j < 4; ++j) { - out[index * 5 + j + 1] = 0; + out[index * 5 + j + 1] = dets[keep_i * 5 + j]; } - score[index] = 0; + score[index] = dets[keep_i * 5 + 4]; } } } @@ -348,24 +349,10 @@ class ProposalGPUOp : public Operator{ Tensor out = out_data[proposal::kOut].get(s); Tensor out_score = out_data[proposal::kScore].get(s); - index_t num_anchors = in_data[proposal::kClsProb].shape_[1] / 2; - 
index_t height = scores.size(2); - index_t width = scores.size(3); - index_t count = num_anchors * height * width; // count of total anchors - index_t rpn_pre_nms_top_n = (param_.rpn_pre_nms_top_n > 0) ? param_.rpn_pre_nms_top_n : count; // set to -1 for max - - float* workspace_proposals_ptr = NULL; - FRCNN_CUDA_CHECK(cudaMalloc(&workspace_proposals_ptr, sizeof(float) * count * 5)); - Tensor workspace_proposals(workspace_proposals_ptr, Shape2(count, 5)); - float* workspace_ordered_proposals_ptr = NULL; - FRCNN_CUDA_CHECK(cudaMalloc(&workspace_ordered_proposals_ptr, sizeof(float) * rpn_pre_nms_top_n * 5)); - Tensor workspace_ordered_proposals(workspace_ordered_proposals_ptr, Shape2(rpn_pre_nms_top_n, 5)); - float* score_ptr = NULL; - FRCNN_CUDA_CHECK(cudaMalloc(&score_ptr, sizeof(float) * count)); - Tensor score(score_ptr, Shape1(count)); - int* order_ptr = NULL; - FRCNN_CUDA_CHECK(cudaMalloc(&order_ptr, sizeof(int) * count)); - Tensor order(order_ptr, Shape1(count)); + int num_anchors = in_data[proposal::kClsProb].shape_[1] / 2; + int height = scores.size(2); + int width = scores.size(3); + int count = num_anchors * height * width; // count of total anchors // Generate first anchors based on base anchor std::vector base_anchor(4); @@ -380,7 +367,11 @@ class ProposalGPUOp : public Operator{ param_.scales.info, anchors); - // Copy generated anchors to GPU + // Copy generated anchors to GPU + float* workspace_proposals_ptr = NULL; + FRCNN_CUDA_CHECK(cudaMalloc(&workspace_proposals_ptr, sizeof(float) * count * 5)); + Tensor workspace_proposals(workspace_proposals_ptr, Shape2(count, 5)); + cudaMemcpy(workspace_proposals.dptr_, &anchors[0], sizeof(float) * anchors.size(), cudaMemcpyHostToDevice); FRCNN_CUDA_CHECK(cudaPeekAtLastError()); @@ -413,6 +404,13 @@ class ProposalGPUOp : public Operator{ FRCNN_CUDA_CHECK(cudaPeekAtLastError()); // Copy score to a continuous memory + float* score_ptr = NULL; + FRCNN_CUDA_CHECK(cudaMalloc(&score_ptr, sizeof(float) * count)); + 
Tensor score(score_ptr, Shape1(count)); + int* order_ptr = NULL; + FRCNN_CUDA_CHECK(cudaMalloc(&order_ptr, sizeof(int) * count)); + Tensor order(order_ptr, Shape1(count)); + CheckLaunchParam(dimGrid, dimBlock, "CopyScore"); CopyScoreKernel<<>>( count, workspace_proposals.dptr_, score.dptr_, order.dptr_); @@ -427,11 +425,16 @@ class ProposalGPUOp : public Operator{ FRCNN_CUDA_CHECK(cudaPeekAtLastError()); // Reorder proposals according to order - const int top_n = std::min(rpn_pre_nms_top_n, count); - dimGrid.x = (top_n + kMaxThreadsPerBlock - 1) / kMaxThreadsPerBlock; + int rpn_pre_nms_top_n = (param_.rpn_pre_nms_top_n > 0) ? param_.rpn_pre_nms_top_n : count; // set to -1 for max + rpn_pre_nms_top_n = std::min(rpn_pre_nms_top_n, count); + float* workspace_ordered_proposals_ptr = NULL; + FRCNN_CUDA_CHECK(cudaMalloc(&workspace_ordered_proposals_ptr, sizeof(float) * rpn_pre_nms_top_n * 5)); + Tensor workspace_ordered_proposals(workspace_ordered_proposals_ptr, Shape2(rpn_pre_nms_top_n, 5)); + + dimGrid.x = (rpn_pre_nms_top_n + kMaxThreadsPerBlock - 1) / kMaxThreadsPerBlock; CheckLaunchParam(dimGrid, dimBlock, "ReorderProposals"); ReorderProposalsKernel<<>>( - top_n, workspace_proposals.dptr_, order.dptr_, top_n, workspace_ordered_proposals.dptr_); + rpn_pre_nms_top_n, workspace_proposals.dptr_, order.dptr_, workspace_ordered_proposals.dptr_); FRCNN_CUDA_CHECK(cudaPeekAtLastError()); FRCNN_CUDA_CHECK(cudaFree(workspace_proposals_ptr)); @@ -453,7 +456,7 @@ class ProposalGPUOp : public Operator{ FRCNN_CUDA_CHECK(cudaPeekAtLastError()); // copy results after nms - const int post_top_n = param_.rpn_post_nms_top_n; + int post_top_n = std::min(param_.rpn_post_nms_top_n, rpn_pre_nms_top_n); dimGrid.x = (post_top_n + kMaxThreadsPerBlock - 1) / kMaxThreadsPerBlock; CheckLaunchParam(dimGrid, dimBlock, "PrepareOutput"); PrepareOutput<<>>( @@ -466,6 +469,28 @@ class ProposalGPUOp : public Operator{ FRCNN_CUDA_CHECK(cudaFree(workspace_ordered_proposals_ptr)); } + virtual void 
Backward(const OpContext &ctx, + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &req, + const std::vector &in_grad, + const std::vector &aux_states) { + using namespace mshadow; + using namespace mshadow::expr; + CHECK_EQ(in_grad.size(), 3); + + Stream *s = ctx.get_stream(); + Tensor gscores = in_grad[proposal::kClsProb].get(s); + Tensor gbbox = in_grad[proposal::kBBoxPred].get(s); + Tensor ginfo = in_grad[proposal::kImInfo].get(s); + + // can not assume the grad would be zero + Assign(gscores, req[proposal::kClsProb], 0); + Assign(gbbox, req[proposal::kBBoxPred], 0); + Assign(ginfo, req[proposal::kImInfo], 0); + } + private: ProposalParam param_; }; // class ProposalGPUOp diff --git a/src/operator/reshape-inl.h b/src/operator/reshape-inl.h index b61224f7ec3b..0b549162a3d3 100644 --- a/src/operator/reshape-inl.h +++ b/src/operator/reshape-inl.h @@ -110,6 +110,7 @@ struct ReshapeParam : public dmlc::Parameter { TShape target_shape; bool keep_highest; ShapeInfo shape; + bool reverse; DMLC_DECLARE_PARAMETER(ReshapeParam) { int tmp[] = {0, 0}; DMLC_DECLARE_FIELD(target_shape) @@ -118,13 +119,19 @@ struct ReshapeParam : public dmlc::Parameter { "in which case it will be inferred from the rest of dims"); DMLC_DECLARE_FIELD(keep_highest).set_default(false) .describe("(Deprecated! Use shape instead.) Whether keep the highest dim unchanged." - "If set to yes, than the first dim in target_shape is ignored," + "If set to true, then the first dim in target_shape is ignored," "and always fixed as input"); DMLC_DECLARE_FIELD(shape) .set_default(ShapeInfo()) .describe("Target new shape. If the dim is same, set it to 0. If the dim is set " "to be -1, it will be inferred from the rest of dims. One and only one dim " "can be -1"); + DMLC_DECLARE_FIELD(reverse) + .set_default(false) + .describe("Whether to match the shapes from the backward. 
If reverse is true, " + "0 values in the `shape` argument will be searched from the backward. E.g the " + "original shape is (10, 5, 4) and the shape argument is (-1, 0). If reverse is true, " + "the new shape should be (50, 4). Otherwise it will be (40, 5)."); } }; @@ -203,31 +210,40 @@ class ReshapeProp : public OperatorProperty { const TShape &dshape = in_shape->at(reshape_enum::kData); if (dshape.ndim() == 0) return false; if (param_.shape.ndim() != 0) { + std::vector dshape_vec; + std::vector param_shape_vec(param_.shape.info); + for (index_t i = 0; i < dshape.ndim(); ++i) { + dshape_vec.push_back(dshape[i]); + } std::vector tmp; int src_idx = 0; int neg_idx = -1; size_t new_size = dshape.Size(); bool keep = true; - for (index_t i = 0; i < param_.shape.info.size(); ++i) { - int proposed_dim = param_.shape.info[i]; + if (param_.reverse) { + std::reverse(dshape_vec.begin(), dshape_vec.end()); + std::reverse(param_shape_vec.begin(), param_shape_vec.end()); + } + for (index_t i = 0; i < param_shape_vec.size(); ++i) { + int proposed_dim = param_shape_vec[i]; if (proposed_dim == 0) { // keep same CHECK_EQ(keep, true) << "After set manual dim, can't keep original dim"; - tmp.push_back(dshape[src_idx++]); + tmp.push_back(dshape_vec[src_idx++]); new_size /= tmp.back(); } else if (proposed_dim < 0) { // infer - CHECK_LT(neg_idx, 0) << "One and only one dim can be inferenced"; + CHECK_LT(neg_idx, 0) << "One and only one dim can be inferred"; neg_idx = i; tmp.push_back(0); src_idx++; } else { - // great than 0, new shape + // greater than 0, new shape CHECK_EQ(new_size % proposed_dim, 0) << "Illegal dim setting, can't be divided."; tmp.push_back(proposed_dim); new_size /= proposed_dim; // after set manual shape, can't keep same - if (param_.shape.info.size() != dshape.ndim()) { + if (param_shape_vec.size() != dshape_vec.size()) { keep = false; } else { src_idx++; @@ -238,6 +254,11 @@ class ReshapeProp : public OperatorProperty { if (neg_idx >= 0) { tmp[neg_idx] = 
new_size; } + if (param_.reverse) { + std::reverse(param_shape_vec.begin(), param_shape_vec.end()); + std::reverse(dshape_vec.begin(), dshape_vec.end()); + std::reverse(tmp.begin(), tmp.end()); + } TShape oshape(tmp.begin(), tmp.end()); CHECK_EQ(oshape.Size(), dshape.Size()) << "Target shape size is different to source. " diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h index a70138adb7ce..3ae65ac16f8e 100644 --- a/src/operator/rnn-inl.h +++ b/src/operator/rnn-inl.h @@ -95,7 +95,7 @@ struct RNNParam : public dmlc::Parameter { DMLC_DECLARE_FIELD(p).set_default(0.) .set_range(0, 1) - .describe("Fraction of the input that gets dropped out at training time"); + .describe("Dropout probability, fraction of the input that gets dropped out at training time"); DMLC_DECLARE_FIELD(state_outputs).set_default(false) .describe("Whether to have the states as symbol outputs."); diff --git a/src/operator/rnn.cc b/src/operator/rnn.cc index 3067c8e986c1..ba411f01544d 100644 --- a/src/operator/rnn.cc +++ b/src/operator/rnn.cc @@ -34,7 +34,7 @@ DMLC_REGISTER_PARAMETER(RNNParam); MXNET_REGISTER_OP_PROPERTY(RNN, RNNProp) .describe("Apply a recurrent layer to input.") .add_argument("data", "Symbol", "Input data to RNN") -.add_argument("parameters", "Symbol", "Vector of all RNN trainable parameters") +.add_argument("parameters", "Symbol", "Vector of all RNN trainable parameters concatenated") .add_argument("state", "Symbol", "initial hidden state of the RNN") .add_argument("state_cell", "Symbol", "initial cell state for LSTM networks (only for LSTM)") .add_arguments(RNNParam::__FIELDS__()); diff --git a/src/operator/roi_pooling-inl.h b/src/operator/roi_pooling-inl.h index d0808aaf9221..85ac723157cd 100644 --- a/src/operator/roi_pooling-inl.h +++ b/src/operator/roi_pooling-inl.h @@ -41,7 +41,7 @@ struct ROIPoolingParam : public dmlc::Parameter { } }; -template +template class ROIPoolingOp : public Operator { public: explicit ROIPoolingOp(ROIPoolingParam p) { @@ -61,10 +61,10 @@ 
class ROIPoolingOp : public Operator { CHECK_EQ(out_data[roipool::kMaxIdx].shape_[0], in_data[roipool::kBox].shape_[0]); Stream *s = ctx.get_stream(); - Tensor data = in_data[roipool::kData].get(s); - Tensor bbox = in_data[roipool::kBox].get(s); - Tensor out = out_data[roipool::kOut].get(s); - Tensor max_idx = out_data[roipool::kMaxIdx].get(s); + Tensor data = in_data[roipool::kData].get(s); + Tensor bbox = in_data[roipool::kBox].get(s); + Tensor out = out_data[roipool::kOut].get(s); + Tensor max_idx = out_data[roipool::kMaxIdx].get(s); CHECK_EQ(data.CheckContiguous(), true); CHECK_EQ(bbox.CheckContiguous(), true); CHECK_EQ(out.CheckContiguous(), true); @@ -87,19 +87,30 @@ class ROIPoolingOp : public Operator { CHECK_EQ(out_data.size(), expected); CHECK_EQ(out_grad[roipool::kOut].shape_[0], in_data[roipool::kBox].shape_[0]); CHECK_EQ(out_data[roipool::kMaxIdx].shape_[0], in_data[roipool::kBox].shape_[0]); - CHECK_EQ(req[roipool::kOut], kWriteTo); + CHECK_NE(req[roipool::kData], kWriteInplace) << + "ROIPooling: Backward doesn't support kWriteInplace."; + CHECK_NE(req[roipool::kBox], kWriteInplace) << + "ROIPooling: Backward doesn't support kWriteInplace."; Stream *s = ctx.get_stream(); - Tensor grad_out = out_grad[roipool::kOut].get(s); - Tensor bbox = in_data[roipool::kBox].get(s); - Tensor max_idx = out_data[roipool::kMaxIdx].get(s); - Tensor grad_in = in_grad[roipool::kData].get(s); + Tensor grad_out = out_grad[roipool::kOut].get(s); + Tensor bbox = in_data[roipool::kBox].get(s); + Tensor max_idx = out_data[roipool::kMaxIdx].get(s); + Tensor grad_in = in_grad[roipool::kData].get(s); + Tensor grad_roi = in_grad[roipool::kBox].get(s); CHECK_EQ(grad_out.CheckContiguous(), true); CHECK_EQ(bbox.CheckContiguous(), true); CHECK_EQ(max_idx.CheckContiguous(), true); CHECK_EQ(grad_in.CheckContiguous(), true); - grad_in = 0.0f; - ROIPoolBackward(grad_in, grad_out, bbox, max_idx, param_.spatial_scale); + if (kAddTo == req[roipool::kData] || kWriteTo == req[roipool::kData]) { 
+ if (kWriteTo == req[roipool::kData]) { + grad_in = 0.0f; + } + ROIPoolBackwardAcc(grad_in, grad_out, bbox, max_idx, param_.spatial_scale); + } + if (kWriteTo == req[roipool::kBox]) { + grad_roi = 0.0f; + } } private: @@ -108,7 +119,7 @@ class ROIPoolingOp : public Operator { // Decalre Factory function, used for dispatch specialization template -Operator* CreateOp(ROIPoolingParam param); +Operator* CreateOp(ROIPoolingParam param, int dtype); #if DMLC_USE_CXX11 class ROIPoolingProp : public OperatorProperty { @@ -162,6 +173,20 @@ class ROIPoolingProp : public OperatorProperty { return true; } + bool InferType(std::vector *in_type, + std::vector *out_type, + std::vector *aux_type) const override { + CHECK_EQ(in_type->size(), 2); + int dtype = (*in_type)[0]; + CHECK_EQ(dtype, (*in_type)[1]); + CHECK_NE(dtype, -1) << "Input must have specified type"; + + out_type->clear(); + out_type->push_back(dtype); + out_type->push_back(dtype); + return true; + } + OperatorProperty* Copy() const override { ROIPoolingProp* roi_pooling_sym = new ROIPoolingProp(); roi_pooling_sym->param_ = this->param_; @@ -180,7 +205,13 @@ class ROIPoolingProp : public OperatorProperty { return {out_grad[roipool::kOut], in_data[roipool::kBox], out_data[roipool::kMaxIdx]}; } - Operator* CreateOperator(Context ctx) const override; + Operator* CreateOperator(Context ctx) const override { + LOG(FATAL) << "Not Implemented."; + return NULL; + } + + Operator* CreateOperatorEx(Context ctx, std::vector *in_shape, + std::vector *in_type) const override; private: ROIPoolingParam param_; diff --git a/src/operator/roi_pooling.cc b/src/operator/roi_pooling.cc index 5f6ec56a5444..7f3ca9f94109 100644 --- a/src/operator/roi_pooling.cc +++ b/src/operator/roi_pooling.cc @@ -108,11 +108,11 @@ inline void ROIPoolForward(const Tensor &out, } template -inline void ROIPoolBackward(const Tensor &in_grad, - const Tensor &out_grad, - const Tensor &bbox, - const Tensor &max_idx, - const float spatial_scale_) { +inline void 
ROIPoolBackwardAcc(const Tensor &in_grad, + const Tensor &out_grad, + const Tensor &bbox, + const Tensor &max_idx, + const float spatial_scale_) { const Dtype *top_diff = out_grad.dptr_; const Dtype *bottom_rois = bbox.dptr_; Dtype *bottom_diff = in_grad.dptr_; @@ -132,22 +132,23 @@ inline void ROIPoolBackward(const Tensor &in_grad, for (int h = 0; h < height_; ++h) { for (int w = 0; w < width_; ++w) { int offset_bottom_diff = (b * channels_ + c) * height_ * width_; - offset_bottom_diff += h * height_ + w; + offset_bottom_diff += h * width_ + w; Dtype gradient = 0; // Accumulate gradient over all ROIs that pooled this element for (int roi_n = 0; roi_n < num_rois; ++roi_n) { - int roi_batch_ind = bottom_rois[0]; + const Dtype* offset_bottom_rois = bottom_rois + roi_n * 5; + int roi_batch_ind = offset_bottom_rois[0]; assert(roi_batch_ind >= 0); assert(roi_batch_ind < batch_size_); if (b != roi_batch_ind) { continue; } - int roi_start_w = round(bottom_rois[1] * spatial_scale_); - int roi_start_h = round(bottom_rois[2] * spatial_scale_); - int roi_end_w = round(bottom_rois[3] * spatial_scale_); - int roi_end_h = round(bottom_rois[4] * spatial_scale_); + int roi_start_w = round(offset_bottom_rois[1] * spatial_scale_); + int roi_start_h = round(offset_bottom_rois[2] * spatial_scale_); + int roi_end_w = round(offset_bottom_rois[3] * spatial_scale_); + int roi_end_h = round(offset_bottom_rois[4] * spatial_scale_); bool in_roi = (w >= roi_start_w && w <= roi_end_w && h >= roi_start_h && h <= roi_end_h); @@ -191,11 +192,8 @@ inline void ROIPoolBackward(const Tensor &in_grad, } } } - - // Increment ROI data pointer - bottom_rois += bbox.size(1); } - bottom_diff[offset_bottom_diff] = gradient; + bottom_diff[offset_bottom_diff] += gradient; } } } @@ -209,13 +207,21 @@ namespace mxnet { namespace op { template<> -Operator* CreateOp(ROIPoolingParam param) { - return new ROIPoolingOp(param); +Operator *CreateOp(ROIPoolingParam param, int dtype) { + Operator* op = NULL; + 
MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + op = new ROIPoolingOp(param); + }); + return op; } -// DO_BIND_DISPATCH comes from static_operator_common.h -Operator* ROIPoolingProp::CreateOperator(Context ctx) const { - DO_BIND_DISPATCH(CreateOp, param_); +Operator *ROIPoolingProp::CreateOperatorEx(Context ctx, std::vector *in_shape, + std::vector *in_type) const { + std::vector out_shape, aux_shape; + std::vector out_type, aux_type; + CHECK(InferType(in_type, &out_type, &aux_type)); + CHECK(InferShape(in_shape, &out_shape, &aux_shape)); + DO_BIND_DISPATCH(CreateOp, param_, in_type->at(0)); } DMLC_REGISTER_PARAMETER(ROIPoolingParam); diff --git a/src/operator/roi_pooling.cu b/src/operator/roi_pooling.cu index b64e37e03318..ab19def80b50 100644 --- a/src/operator/roi_pooling.cu +++ b/src/operator/roi_pooling.cu @@ -10,13 +10,6 @@ #include #include -#define ROIPOOLING_CUDA_CHECK(condition) \ - /* Code block avoids redefinition of cudaError_t error */ \ - do { \ - cudaError_t error = condition; \ - CHECK_EQ(error, cudaSuccess) << " " << cudaGetErrorString(error); \ - } while (0) - namespace mshadow { namespace cuda { @@ -117,16 +110,15 @@ inline void ROIPoolForward(const Tensor &out, ROIPoolForwardKernel<<>>( count, bottom_data, spatial_scale, channels, height, width, pooled_height, pooled_width, bottom_rois, top_data, argmax_data); - ROIPOOLING_CUDA_CHECK(cudaPeekAtLastError()); } template -__global__ void ROIPoolBackwardKernel(const int count, const Dtype* top_diff, - const Dtype* argmax_data, const int num_rois, - const float spatial_scale, const int channels, - const int height, const int width, - const int pooled_height, const int pooled_width, - Dtype* bottom_diff, const Dtype* bottom_rois) { +__global__ void ROIPoolBackwardAccKernel(const int count, const Dtype* top_diff, + const Dtype* argmax_data, const int num_rois, + const float spatial_scale, const int channels, + const int height, const int width, + const int pooled_height, const int pooled_width, + Dtype* 
bottom_diff, const Dtype* bottom_rois) { for (int index = (blockIdx.x + blockIdx.y * gridDim.x) * blockDim.x + threadIdx.x; index < count; index += blockDim.x * gridDim.x * gridDim.y) { @@ -192,16 +184,16 @@ __global__ void ROIPoolBackwardKernel(const int count, const Dtype* top_diff, } } } - bottom_diff[index] = gradient; + bottom_diff[index] += gradient; } } template -inline void ROIPoolBackward(const Tensor &in_grad, - const Tensor &out_grad, - const Tensor &bbox, - const Tensor &max_idx, - const float spatial_scale) { +inline void ROIPoolBackwardAcc(const Tensor &in_grad, + const Tensor &out_grad, + const Tensor &bbox, + const Tensor &max_idx, + const float spatial_scale) { const Dtype *top_diff = out_grad.dptr_; const Dtype *bottom_rois = bbox.dptr_; Dtype *bottom_diff = in_grad.dptr_; @@ -218,10 +210,9 @@ inline void ROIPoolBackward(const Tensor &in_grad, dim3 dimBlock(kMaxThreadsPerBlock); CheckLaunchParam(dimGrid, dimBlock, "ROIPooling Backward"); cudaStream_t stream = Stream::GetStream(in_grad.stream_); - ROIPoolBackwardKernel<<>>( + ROIPoolBackwardAccKernel<<>>( count, top_diff, argmax_data, num_rois, spatial_scale, channels, height, width, pooled_height, pooled_width, bottom_diff, bottom_rois); - ROIPOOLING_CUDA_CHECK(cudaPeekAtLastError()); } } // namespace cuda @@ -236,12 +227,12 @@ inline void ROIPoolForward(const Tensor &out, } template -inline void ROIPoolBackward(const Tensor &in_grad, - const Tensor &out_grad, - const Tensor &bbox, - const Tensor &max_idx, - const float spatial_scale) { - cuda::ROIPoolBackward(in_grad, out_grad, bbox, max_idx, spatial_scale); +inline void ROIPoolBackwardAcc(const Tensor &in_grad, + const Tensor &out_grad, + const Tensor &bbox, + const Tensor &max_idx, + const float spatial_scale) { + cuda::ROIPoolBackwardAcc(in_grad, out_grad, bbox, max_idx, spatial_scale); } } // namespace mshadow @@ -251,8 +242,13 @@ namespace mxnet { namespace op { template<> -Operator* CreateOp(ROIPoolingParam param) { - return new 
ROIPoolingOp(param); +Operator* CreateOp(ROIPoolingParam param, int dtype) { + Operator* op = NULL; + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + op = new ROIPoolingOp(param); + }); + return op; } + } // namespace op } // namespace mxnet diff --git a/src/operator/scale-inl.h b/src/operator/scale-inl.h new file mode 100644 index 000000000000..da982640e40b --- /dev/null +++ b/src/operator/scale-inl.h @@ -0,0 +1,173 @@ +/*! + * Copyright (c) 2015 by Contributors + * \file scale-inl.h + * \brief scale operator and symbol +*/ +#ifndef MXNET_OPERATOR_SCALE_INL_H_ +#define MXNET_OPERATOR_SCALE_INL_H_ + +#include +#include +#include +#include +#include +#include +#include +#include "./operator_common.h" +#include "./mshadow_op.h" + +namespace mxnet { +namespace op { + +namespace scale { +enum ScaleOpInputs {kData, kGamma, kBeta}; +enum ScaleOpOutputs {kOut}; +} // namespace scale + +struct ScaleParam : public dmlc::Parameter { + bool no_bias; + DMLC_DECLARE_PARAMETER(ScaleParam) { + DMLC_DECLARE_FIELD(no_bias).set_default(false) + .describe("Whether to disable bias parameter."); + } +}; + +template +class ScaleOp : public Operator { + public: + explicit ScaleOp(ScaleParam p) { + this->param_ = p; + } + + virtual void Forward(const OpContext &ctx, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data, + const std::vector &aux_states) { + using namespace mshadow; + using namespace mshadow::expr; + CHECK_EQ(in_data.size(), 3); + CHECK_EQ(out_data.size(), 1); + CHECK_EQ(req.size(), 1); + CHECK_EQ(req[scale::kOut], kWriteTo); + + Stream *s = ctx.get_stream(); + + Tensor data; + Tensor out; + if (in_data[scale::kData].ndim() == 2) { + Shape<4> dshape = Shape4(in_data[scale::kData].shape_[0], + in_data[scale::kData].shape_[1], 1, 1); + data = in_data[scale::kData].get_with_shape(dshape, s); + out = out_data[scale::kOut].get_with_shape(dshape, s); + } else { + data = in_data[scale::kData].get(s); + out = out_data[scale::kOut].get(s); + } + Tensor 
slope = in_data[scale::kGamma].get(s); + Tensor bias = in_data[scale::kBeta].get(s); + + Assign(out, req[scale::kOut], broadcast<1>(slope, data.shape_) * data + + broadcast<1>(bias, data.shape_)); + } + + virtual void Backward(const OpContext &ctx, + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &req, + const std::vector &in_grad, + const std::vector &aux_states) { + using namespace mshadow; + using namespace mshadow::expr; + CHECK_EQ(out_grad.size(), 1); + CHECK_EQ(in_data.size(), 3); + CHECK_EQ(out_data.size(), 1); + CHECK_EQ(in_grad.size(), 3); + Stream *s = ctx.get_stream(); + Tensor data, grad, grad_in; + + if (in_data[scale::kData].ndim() == 2) { + Shape<4> dshape = Shape4(out_grad[scale::kOut].shape_[0], + out_grad[scale::kOut].shape_[1], 1, 1); + data = in_data[scale::kData].get_with_shape(dshape, s); + grad = out_grad[scale::kOut].get_with_shape(dshape, s); + grad_in = in_grad[scale::kData].get_with_shape(dshape, s); + } else { + data = in_data[scale::kData].get(s); + grad = out_grad[scale::kOut].get(s); + grad_in = in_grad[scale::kData].get(s); + } + + Tensor slope = in_data[scale::kGamma].get(s); + Tensor gslope = in_grad[scale::kGamma].get(s); + Tensor gbias = in_grad[scale::kBeta].get(s); + + Assign(gslope, req[scale::kGamma], sumall_except_dim<1>(grad * data)); + Assign(gbias, req[scale::kBeta], sumall_except_dim<1>(grad)); + Assign(grad_in, req[scale::kData], (grad * broadcast<1>(slope, data.shape_))); + } + + private: + ScaleParam param_; + +}; // class ScaleOp + +// Decalre Factory function, used for dispatch specialization +template +Operator *CreateOp(ScaleParam param); + +#if DMLC_USE_CXX11 +class ScaleProp : public OperatorProperty { + public: + void Init(const std::vector >& kwargs) { + param_.Init(kwargs); + } + + std::map GetParams() const override { + return param_.__DICT__(); + } + + bool InferShape(std::vector *in_shape, + std::vector *out_shape, + std::vector *aux_shape) 
const override { + using namespace mshadow; + CHECK_EQ(in_shape->size(), 3) << "Input:[data, gamma, beta]"; + const TShape &dshape = in_shape->at(0); + if (dshape.ndim() == 0) return false; + in_shape->at(1) = TShape(Shape1(dshape[1])); + in_shape->at(2) = TShape(Shape1(dshape[1])); + out_shape->clear(); + out_shape->push_back(dshape); + + return true; + } + + OperatorProperty* Copy() const override { + auto ptr = new ScaleProp(); + ptr->param_ = param_; + return ptr; + } + + std::string TypeString() const override { + return "Scale"; + } + + std::vector ListArguments() const override { + return {"data", "gamma", "beta"}; + } + + std::vector ListOutputs() const override { + return {"output"}; + } + + Operator* CreateOperator(Context ctx) const override; + + private: + ScaleParam param_; +}; // class ScaleParam + +#endif // DMLC_USE_CXX11 +} // namespace op +} // namespace mxnet +#endif // MXNET_OPERATOR_SCALE_INL_H_ diff --git a/src/operator/scale.cc b/src/operator/scale.cc new file mode 100644 index 000000000000..802e1295d6cd --- /dev/null +++ b/src/operator/scale.cc @@ -0,0 +1,29 @@ +/*! 
+ * Copyright (c) 2015 by Contributors + * \file scale.cc + * \brief scale operator +*/ +#include "./scale-inl.h" +namespace mxnet { +namespace op { +template<> +Operator* CreateOp(ScaleParam param) { + return new ScaleOp(param); +} + +// DO_BIND_DISPATCH comes from operator_common.h +Operator *ScaleProp::CreateOperator(Context ctx) const { + DO_BIND_DISPATCH(CreateOp, param_); +} + +DMLC_REGISTER_PARAMETER(ScaleParam); + +MXNET_REGISTER_OP_PROPERTY(Scale, ScaleProp) +.describe("Scale input then add a bias.") +.add_argument("data", "Symbol", "Input data to the ScaleOp.") +.add_argument("gamma", "Symbol", "Scale Parameter.") +.add_argument("bias", "Symbol", "Bias parameter.") +.add_arguments(ScaleParam::__FIELDS__()); + +} // namespace op +} // namespace mxnet diff --git a/src/operator/scale.cu b/src/operator/scale.cu new file mode 100644 index 000000000000..5c3c137bf8d2 --- /dev/null +++ b/src/operator/scale.cu @@ -0,0 +1,14 @@ +/*! + * Copyright (c) 2015 by Contributors + * \file scale.cu + * \brief scale operator +*/ +#include "./scale-inl.h" +namespace mxnet { +namespace op { +template<> +Operator* CreateOp(ScaleParam param) { + return new ScaleOp(param); +} +} // namespace op +} // namespace mxnet diff --git a/src/operator/sequence_last-inl.h b/src/operator/sequence_last-inl.h new file mode 100644 index 000000000000..fba88896a9ff --- /dev/null +++ b/src/operator/sequence_last-inl.h @@ -0,0 +1,236 @@ +/*! 
+ * Copyright (c) 2016 by Contributors + * \file sequence_last-inl.h + * \brief + * \author Sebastian Bodenstien +*/ +#ifndef MXNET_OPERATOR_SEQUENCE_LAST_INL_H_ +#define MXNET_OPERATOR_SEQUENCE_LAST_INL_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "./mshadow_op.h" +#include "./operator_common.h" +#include "./operator_common.h" +#include "./sequence_op_common.h" + +namespace mxnet { +namespace op { + +namespace seq_last { +enum SequenceLastOpInputs { kData, kSequenceLength }; +enum SequenceLastOpOutputs { kOut }; +} + +struct SequenceLastParam : public dmlc::Parameter { + bool use_sequence_length; + DMLC_DECLARE_PARAMETER(SequenceLastParam) { + DMLC_DECLARE_FIELD(use_sequence_length) + .set_default(false) + .describe( + "If set to true, this layer takes in extra input sequence_length " + "to specify variable length sequence"); + } +}; + +template +class SequenceLastOp : public Operator { + public: + explicit SequenceLastOp(SequenceLastParam p) { this->param_ = p; } + + virtual void Forward(const OpContext &ctx, const std::vector &in_data, + const std::vector &req, + const std::vector &out_data, + const std::vector &aux_args) { + using namespace mshadow; + using namespace mshadow::expr; + + CHECK_EQ(in_data.size(), param_.use_sequence_length ? 
2 : 1); + CHECK_EQ(out_data.size(), 1); + Stream *s = ctx.get_stream(); + + // Get any size input + output into required form + int n = in_data[seq_last::kData].size(1); + int max_seq_len = in_data[seq_last::kData].size(0); + int total_size = in_data[seq_last::kData].Size(); + Shape<2> s2 = Shape2(n, static_cast(total_size / n / max_seq_len)); + Shape<3> s3 = + Shape3(max_seq_len, n, static_cast(total_size / n / max_seq_len)); + Tensor data = + in_data[seq_last::kData].get_with_shape(s3, s); + Tensor out = + out_data[seq_last::kOut].get_with_shape(s2, s); + + if (param_.use_sequence_length) { + std::vector indices_vec(n, max_seq_len); + IndexTensorToVector( + in_data[seq_last::kSequenceLength].get(s), + &indices_vec); + if (req[seq_last::kOut] == kWriteTo) out = 0.0f; + index_t seq_ind; + for (index_t i = 0; i < n; ++i) { + seq_ind = indices_vec[i] - 1; // 1-indexing + out[i] += data[seq_ind][i]; + } + } else { + Assign(out, req[seq_last::kOut], + F(data[max_seq_len - 1])); + } + } + + virtual void Backward(const OpContext &ctx, + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &req, + const std::vector &in_grad, + const std::vector &aux_args) { + using namespace mshadow; + using namespace mshadow::expr; + CHECK_EQ(out_grad.size(), 1); + CHECK_EQ(in_data.size(), param_.use_sequence_length ? 
2 : 1); + + // break immediately if null grad + if (req[seq_last::kData] == kNullOp) return; + + Stream *s = ctx.get_stream(); + + // Get any size input + output into required form + int n = in_grad[seq_last::kData].size(1); + int max_seq_len = in_grad[seq_last::kData].size(0); + int total_size = in_grad[seq_last::kData].Size(); + Shape<2> s2 = Shape2(n, static_cast(total_size / n / max_seq_len)); + Shape<3> s3 = + Shape3(max_seq_len, n, static_cast(total_size / n / max_seq_len)); + + Tensor data_grad = + in_grad[seq_last::kData].get_with_shape(s3, s); + Tensor output_grad = + out_grad[seq_last::kOut].get_with_shape(s2, s); + + // copy indices to vector + std::vector indices_vec(n, max_seq_len); + if (param_.use_sequence_length) + IndexTensorToVector( + in_data[seq_last::kSequenceLength].get(s), + &indices_vec); + + index_t seq_ind; + if (req[seq_last::kData] == kWriteTo) data_grad = 0.0f; + for (index_t i = 0; i < n; ++i) { + seq_ind = indices_vec[i] - 1; + data_grad[seq_ind][i] += output_grad[i]; + } + } + + private: + SequenceLastParam param_; +}; // class SequenceLastOp + +template +Operator *CreateOp(SequenceLastParam param, int dtype); + +#if DMLC_USE_CXX11 +class SequenceLastProp : public OperatorProperty { + public: + int NumOutputs() const override { return 1; } + + std::vector ListArguments() const override { + if (param_.use_sequence_length) + return {"data", "sequence_length"}; + else + return {"data"}; + } + + std::vector ListOutputs() const override { return {"output"}; } + + void Init( + const std::vector> &kwargs) override { + param_.Init(kwargs); + } + + std::map GetParams() const override { + return param_.__DICT__(); + } + + bool InferShape(std::vector *in_shape, std::vector *out_shape, + std::vector *aux_shape) const override { + using namespace mshadow; + CHECK_EQ(in_shape->size(), param_.use_sequence_length ? 
2 : 1) + << "Input:[data, sequence_length]"; + + const TShape &dshape = (*in_shape)[seq_last::kData]; + if (dshape.ndim() == 0) return false; + // seq length vector is same as batch size + if (param_.use_sequence_length) + SHAPE_ASSIGN_CHECK(*in_shape, seq_last::kSequenceLength, + Shape1(dshape[1])); + + // calculate output size + TShape shape_o(dshape.ndim() - 1); + for (index_t i = 0; i < shape_o.ndim(); ++i) shape_o[i] = dshape[i + 1]; + + const TShape &oshape = shape_o; + out_shape->clear(); + out_shape->push_back(oshape); + return true; + } + + bool InferType(std::vector *in_type, std::vector *out_type, + std::vector *aux_type) const override { + CHECK_GE(in_type->size(), param_.use_sequence_length ? 2 : 1); + int dtype = (*in_type)[0]; + CHECK_NE(dtype, -1) << "First input must have specified type"; + for (index_t i = 0; i < in_type->size(); ++i) { + if ((*in_type)[i] == -1) { + (*in_type)[i] = dtype; + } else { + CHECK_EQ((*in_type)[i], dtype) << "This layer requires uniform type. " + << "Expected " << dtype << " v.s. 
given " + << (*in_type)[i] << " at " + << ListArguments()[i]; + } + } + out_type->clear(); + out_type->push_back(dtype); + return true; + } + + OperatorProperty *Copy() const override { + auto ptr = new SequenceLastProp(); + ptr->param_ = param_; + return ptr; + } + + std::string TypeString() const override { return "SequenceLast"; } + + std::vector DeclareBackwardDependency( + const std::vector &out_grad, const std::vector &in_data, + const std::vector &out_data) const override { + if (param_.use_sequence_length) + return {out_grad[seq_last::kOut], in_data[seq_last::kSequenceLength]}; + else + return {out_grad[seq_last::kOut]}; + } + + Operator *CreateOperator(Context ctx) const override { + LOG(FATAL) << "Not Implemented."; + return NULL; + } + + Operator *CreateOperatorEx(Context ctx, std::vector *in_shape, + std::vector *in_type) const override; + + private: + SequenceLastParam param_; +}; // class SequenceLastProp +#endif // DMLC_USE_CXX11 +} // namespace op +} // namespace mxnet +#endif // MXNET_OPERATOR_SEQUENCE_LAST_INL_H_ diff --git a/src/operator/sequence_last.cc b/src/operator/sequence_last.cc new file mode 100644 index 000000000000..02e0a6d2253c --- /dev/null +++ b/src/operator/sequence_last.cc @@ -0,0 +1,50 @@ +/*! 
+ * Copyright (c) 2015 by Contributors + * \file sequence_last.cc + * \brief + * \author Sebastian Bodenstein +*/ +#include "./sequence_last-inl.h" + +namespace mxnet { +namespace op { +template <> +Operator *CreateOp(SequenceLastParam param, int dtype) { + Operator *op = NULL; + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, + { op = new SequenceLastOp(param); }) + return op; +} + +// DO_BIND_DISPATCH comes from operator_common.h +Operator *SequenceLastProp::CreateOperatorEx(Context ctx, + std::vector *in_shape, + std::vector *in_type) const { + std::vector out_shape, aux_shape; + std::vector out_type, aux_type; + CHECK(InferType(in_type, &out_type, &aux_type)); + CHECK(InferShape(in_shape, &out_shape, &aux_shape)); + DO_BIND_DISPATCH(CreateOp, param_, (*in_type)[0]); +} + +DMLC_REGISTER_PARAMETER(SequenceLastParam); + +MXNET_REGISTER_OP_PROPERTY(SequenceLast, SequenceLastProp) + .describe( +"Takes the last element of a sequence. Takes an n-dimensional tensor of " +"the form [max sequence length, batchsize, other dims] and returns a (n-1)-dimensional tensor " +"of the form [batchsize, other dims]. This operator takes an optional input tensor " +"sequence_length of positive ints of dimension [batchsize] when the " +"sequence_length option is set to true. This allows the operator to handle " +"variable-length sequences. If sequence_length is false, then each example " +"in the batch is assumed to have the max sequence length." +) + .add_argument("data", "Symbol", + "n-dimensional input tensor of the form [max sequence " + "length, batchsize, other dims]") + .add_argument("sequence_length", "Symbol", + "vector of sequence lengths of size batchsize") + .add_arguments(SequenceLastParam::__FIELDS__()); + +} // namespace op +} // namespace mxnet diff --git a/src/operator/sequence_last.cu b/src/operator/sequence_last.cu new file mode 100644 index 000000000000..329c2c77f6b4 --- /dev/null +++ b/src/operator/sequence_last.cu @@ -0,0 +1,20 @@ +/*! 
+ * Copyright (c) 2015 by Contributors + * \file sequence_last.cu + * \brief + * \author Sebastian Bodenstein +*/ + +#include "./sequence_last-inl.h" + +namespace mxnet { +namespace op { +template <> Operator *CreateOp(SequenceLastParam param, int dtype) { + Operator *op = NULL; + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, + { op = new SequenceLastOp(param); }) + return op; +} + +} // namespace op +} // namespace mxnet diff --git a/src/operator/sequence_mask-inl.h b/src/operator/sequence_mask-inl.h new file mode 100644 index 000000000000..3b716185f3c4 --- /dev/null +++ b/src/operator/sequence_mask-inl.h @@ -0,0 +1,220 @@ +/*! + * Copyright (c) 2016 by Contributors + * \file wl_sequence_mask-inl.h + * \brief + * \author Sebastian Bodenstien +*/ + +#ifndef MXNET_OPERATOR_SEQUENCE_MASK_INL_H_ +#define MXNET_OPERATOR_SEQUENCE_MASK_INL_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "./operator_common.h" +#include "./mshadow_op.h" +#include "./operator_common.h" + +namespace mxnet { +namespace op { + +namespace seq_mask { +enum SequenceMaskOpInputs { kData, kSequenceLength }; +enum SequenceMaskOpOutputs { kOut }; +} + +struct SequenceMaskParam : public dmlc::Parameter { + bool use_sequence_length; + DMLC_DECLARE_PARAMETER(SequenceMaskParam) { + DMLC_DECLARE_FIELD(use_sequence_length) + .set_default(false) + .describe( + "If set to true, this layer takes in extra input sequence_length " + "to specify variable length sequence"); + } +}; + +template +class SequenceMaskOp : public Operator { + public: + explicit SequenceMaskOp(SequenceMaskParam p) { this->param_ = p; } + + virtual void Forward(const OpContext &ctx, const std::vector &in_data, + const std::vector &req, + const std::vector &out_data, + const std::vector &aux_args) { + using namespace mshadow; + using namespace mshadow::expr; + CHECK_EQ(in_data.size(), param_.use_sequence_length ? 
2 : 1); + CHECK_EQ(out_data.size(), 1); + Stream *s = ctx.get_stream(); + + // Get any size input + output into required form + int max_seq_len = in_data[seq_mask::kData].size(0); + int n = in_data[seq_mask::kData].size(1); + int total_size = in_data[seq_mask::kData].Size(); + int rest_dim = static_cast(total_size / n / max_seq_len); + + Shape<3> s3 = Shape3(max_seq_len, n, rest_dim); + Tensor data = + in_data[seq_mask::kData].get_with_shape(s3, s); + Tensor out = + out_data[seq_mask::kOut].get_with_shape(s3, s); + Assign(out, req[seq_mask::kOut], F(data)); + + if (param_.use_sequence_length) { + Tensor indices = + in_data[seq_mask::kSequenceLength].get(s); + SequenceMask(out, indices); + } + } + + virtual void Backward(const OpContext &ctx, + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &req, + const std::vector &in_grad, + const std::vector &aux_args) { + using namespace mshadow; + using namespace mshadow::expr; + CHECK_EQ(out_grad.size(), 1); + CHECK_EQ(in_data.size(), param_.use_sequence_length ? 
2 : 1); + Stream *s = ctx.get_stream(); + + // Get any size input + output into required form + int max_seq_len = in_grad[seq_mask::kData].size(0); + int n = in_grad[seq_mask::kData].size(1); + int total_size = in_grad[seq_mask::kData].Size(); + int rest_dim = static_cast(total_size / n / max_seq_len); + + Shape<3> s3 = Shape3(max_seq_len, n, rest_dim); + + Tensor data_grad = + in_grad[seq_mask::kData].get_with_shape(s3, s); + Tensor output_grad = + out_grad[seq_mask::kOut].get_with_shape(s3, s); + + Assign(data_grad, req[seq_mask::kData], + F(output_grad)); + + if (param_.use_sequence_length) { + Tensor indices = + in_data[seq_mask::kSequenceLength].get(s); + SequenceMask(data_grad, indices); + } + } + + private: + SequenceMaskParam param_; +}; // class SequenceMaskOp + +template +Operator *CreateOp(SequenceMaskParam param, int dtype); + +#if DMLC_USE_CXX11 +class SequenceMaskProp : public OperatorProperty { + public: + int NumVisibleOutputs() const override { return 1; } + + int NumOutputs() const override { return 1; } + + std::vector ListArguments() const override { + if (param_.use_sequence_length) + return {"data", "sequence_length"}; + else + return {"data"}; + } + + std::vector ListOutputs() const override { return {"output"}; } + + void Init(const std::vector > &kwargs) + override { + param_.Init(kwargs); + } + + std::map GetParams() const override { + return param_.__DICT__(); + } + + bool InferShape(std::vector *in_shape, std::vector *out_shape, + std::vector *aux_shape) const override { + using namespace mshadow; + CHECK_EQ(in_shape->size(), param_.use_sequence_length ? 
2 : 1) + << "Input:[data, sequence_length]"; + + const TShape &dshape = (*in_shape)[seq_mask::kData]; + if (dshape.ndim() == 0) return false; + // seq length vector is same as batch size + if (param_.use_sequence_length) + SHAPE_ASSIGN_CHECK(*in_shape, seq_mask::kSequenceLength, + Shape1(dshape[1])); + + const TShape &oshape = dshape; + out_shape->clear(); + out_shape->push_back(oshape); + return true; + } + + bool InferType(std::vector *in_type, std::vector *out_type, + std::vector *aux_type) const override { + CHECK_GE(in_type->size(), param_.use_sequence_length ? 2 : 1); + int dtype = (*in_type)[0]; + CHECK_NE(dtype, -1) << "First input must have specified type"; + for (index_t i = 0; i < in_type->size(); ++i) { + if ((*in_type)[i] == -1) { + (*in_type)[i] = dtype; + } else { + CHECK_EQ((*in_type)[i], dtype) << "This layer requires uniform type. " + << "Expected " << dtype << " v.s. given " + << (*in_type)[i] << " at " + << ListArguments()[i]; + } + } + out_type->clear(); + out_type->push_back(dtype); + return true; + } + + OperatorProperty *Copy() const override { + auto ptr = new SequenceMaskProp(); + ptr->param_ = param_; + return ptr; + } + + std::string TypeString() const override { return "SequenceMask"; } + + std::vector DeclareBackwardDependency( + const std::vector &out_grad, const std::vector &in_data, + const std::vector &out_data) const override { + if (param_.use_sequence_length) + return {out_grad[seq_mask::kOut], in_data[seq_mask::kSequenceLength]}; + else + return {out_grad[seq_mask::kOut]}; + } + + std::vector BackwardResource( + const std::vector &in_shape) const override { + return {ResourceRequest::kTempSpace}; + } + + Operator *CreateOperator(Context ctx) const override { + LOG(FATAL) << "Not Implemented."; + return NULL; + } + + Operator *CreateOperatorEx(Context ctx, std::vector *in_shape, + std::vector *in_type) const override; + + private: + SequenceMaskParam param_; +}; // class SequenceMaskProp +#endif // DMLC_USE_CXX11 +} // namespace 
op +} // namespace mxnet +#endif // MXNET_OPERATOR_SEQUENCE_MASK_INL_H_ diff --git a/src/operator/sequence_mask.cc b/src/operator/sequence_mask.cc new file mode 100644 index 000000000000..15584ab3d00c --- /dev/null +++ b/src/operator/sequence_mask.cc @@ -0,0 +1,63 @@ +/*! + * Copyright (c) 2015 by Contributors + * \file sequence_mask.cc + * \brief + * \author Sebastian Bodenstein +*/ +#include "./sequence_mask-inl.h" + +namespace mshadow { + +template +inline void SequenceMask(const Tensor &dst, + const Tensor label) { + for (index_t b = 0; b < dst.size(1); ++b) + for (index_t s = label[b]; s < dst.size(0); ++s) + for (index_t r = 0; r < dst.size(2); ++r) + dst[s][b][r] = 0.; +} + +} // namespace mshadow + +namespace mxnet { +namespace op { +template <> +Operator *CreateOp(SequenceMaskParam param, int dtype) { + Operator *op = NULL; + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, + { op = new SequenceMaskOp(param); }) + return op; +} + +// DO_BIND_DISPATCH comes from operator_common.h +Operator *SequenceMaskProp::CreateOperatorEx(Context ctx, + std::vector *in_shape, + std::vector *in_type) const { + std::vector out_shape, aux_shape; + std::vector out_type, aux_type; + CHECK(InferType(in_type, &out_type, &aux_type)); + CHECK(InferShape(in_shape, &out_shape, &aux_shape)); + DO_BIND_DISPATCH(CreateOp, param_, (*in_type)[0]); +} + +DMLC_REGISTER_PARAMETER(SequenceMaskParam); + +MXNET_REGISTER_OP_PROPERTY(SequenceMask, SequenceMaskProp) + .describe( +"Sets all elements outside the sequence to zero. Takes an n-dimensional tensor of the " +"form [max sequence length, batchsize, other dims] and returns a tensor of the same " +"shape. This operator takes an optional input tensor sequence_length of positive ints of " +"dimension [batchsize] when the sequence_length option is set to true. This allows the " +"operator to handle variable-length sequences. 
If sequence_length is false, then each " +"example in the batch is assumed to have the max sequence length, and this operator becomes " +"the identity operator." +) + .add_argument("data", "Symbol", + "n-dimensional input tensor of the form [max sequence " + "length, batchsize, other dims]") + .add_argument("sequence_length", "Symbol", + "vector of sequence lengths of size batchsize") + .add_arguments(SequenceMaskParam::__FIELDS__()); + +} // namespace op +} // namespace mxnet diff --git a/src/operator/sequence_mask.cu b/src/operator/sequence_mask.cu new file mode 100644 index 000000000000..bab13e1c0384 --- /dev/null +++ b/src/operator/sequence_mask.cu @@ -0,0 +1,69 @@ +/*! + * Copyright (c) 2015 by Contributors + * \file sequence_mask.cu + * \brief + * \author Sebastian Bodenstein +*/ + +#include "./sequence_mask-inl.h" + + +namespace mshadow { +namespace cuda { + +//////////////////////////////////////////////////////////////////////////////// +// Cross-Entropy loss +template +__global__ void SequenceMaskKernel(Tensor dst, + const Tensor lengths) { + const index_t smax = dst.size(0); + const index_t bmax = lengths.size(1); + const index_t nmax = dst.size(2); + unsigned int batch = threadIdx.x + blockIdx.x * blockDim.x; + + // early return if out of bounds + if (batch >= bmax) + return; + + // loop over batches + for (index_t s = lengths[batch]; s < smax; ++s) + for (index_t r = 0; r < nmax; ++r) + dst[s][batch][r] = 0.; +} + +//////////////////////////////////////////////////////////////////////////////// + +template +inline void SequenceMask(const Tensor &dst, + const Tensor &lengths) { + dim3 dimBlock(kBaseThreadNum); + dim3 dimGrid(dst.size(1)); + CheckLaunchParam(dimGrid, dimBlock, "SequenceMask"); + cudaStream_t stream = Stream::GetStream(dst.stream_); + SequenceMaskKernel<<>>(dst, lengths); +} + +//////////////////////////////////////////////////////////////////////////////// +} // namespace cuda + +template +inline void SequenceMask(Tensor dst, + const 
Tensor &lengths) { + cuda::SequenceMask(dst, lengths); +} + +} // namespace mshadow + +//////////////////////////////////////////////////////////////////////////////// + +namespace mxnet { +namespace op { +template <> Operator *CreateOp(SequenceMaskParam param, int dtype) { + Operator *op = NULL; + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, + { op = new SequenceMaskOp(param); }) + return op; +} + +} // namespace op +} // namespace mxnet diff --git a/src/operator/sequence_op_common.h b/src/operator/sequence_op_common.h new file mode 100644 index 000000000000..a2924921218f --- /dev/null +++ b/src/operator/sequence_op_common.h @@ -0,0 +1,45 @@ +/*! + * Copyright (c) 2015 by Contributors + * \file sequence_op_common.h + * \brief common function used for sequence layers + * \author Sebastian Bodenstein +*/ +#ifndef MXNET_OPERATOR_SEQUENCE_OP_COMMON_H_ +#define MXNET_OPERATOR_SEQUENCE_OP_COMMON_H_ +#include +#include +#include +#include "./operator_common.h" + +namespace mxnet { +namespace op { + +template +void IndexTensorToVector(mshadow::Tensor data, + std::vector *index_vec) { + int max_seq_len = data.shape_.Size(); +#if MXNET_USE_CUDA + DType *temp_index = + reinterpret_cast(malloc(sizeof(DType) * max_seq_len)); + cudaError_t cuda_status = + cudaMemcpyAsync(temp_index, data.dptr_, max_seq_len * sizeof(DType), + cudaMemcpyDeviceToHost, data.stream_->stream_); + CHECK_EQ(cuda_status, cudaSuccess) << "cuda memcpy label error"; + for (int i = 0; i < max_seq_len; ++i) { + (*index_vec)[i] = static_cast(temp_index[i]); + } + free(temp_index); +#endif +} +template +void IndexTensorToVector(mshadow::Tensor data, + std::vector *index_vec) { + int max_seq_len = data.shape_.Size(); + DType *index_array = static_cast(data.dptr_); + for (int i = 0; i < max_seq_len; ++i) + (*index_vec)[i] = static_cast(index_array[i]); +} + +} // namespace op +} // namespace mxnet +#endif // MXNET_OPERATOR_SEQUENCE_OP_COMMON_H_ diff --git a/src/operator/sequence_reverse-inl.h 
b/src/operator/sequence_reverse-inl.h new file mode 100644 index 000000000000..fa56c6884c93 --- /dev/null +++ b/src/operator/sequence_reverse-inl.h @@ -0,0 +1,244 @@ +/*! + * Copyright (c) 2016 by Contributors + * \file sequence_reverse-inl.h + * \brief + * \author Sebastian Bodenstien +*/ + +#ifndef MXNET_OPERATOR_SEQUENCE_REVERSE_INL_H_ +#define MXNET_OPERATOR_SEQUENCE_REVERSE_INL_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "./operator_common.h" +#include "./sequence_op_common.h" +#include "./mshadow_op.h" + +namespace mxnet { +namespace op { + +namespace seq_reverse { +enum SequenceReverseOpInputs { kData, kSequenceLength }; +enum SequenceReverseOpOutputs { kOut }; +} + +struct SequenceReverseParam : public dmlc::Parameter { + bool use_sequence_length; + DMLC_DECLARE_PARAMETER(SequenceReverseParam) { + DMLC_DECLARE_FIELD(use_sequence_length) + .set_default(false) + .describe( + "If set to true, this layer takes in extra input sequence_length " + "to specify variable length sequence"); + } +}; + +template +class SequenceReverseOp : public Operator { + public: + explicit SequenceReverseOp(SequenceReverseParam p) { this->param_ = p; } + void sequence_reverse(const mshadow::Tensor data, + const mshadow::Tensor &out, + std::vector indices, OpReqType req) { + using namespace mshadow; + using namespace mshadow::expr; + index_t seq_length; + index_t max_seq_len = data.size(0); + index_t batch_size = data.size(1); + for (index_t b = 0; b < batch_size; ++b) { + seq_length = indices[b]; + for (index_t s = 0; s < max_seq_len; ++s) { + if (s < seq_length) + Assign( + out[s][b], req, + F( + data[seq_length - s - 1][b])) + else // preserve padding type + Assign(out[s][b], req, F(data[s][b])) + } + } + } + + virtual void Forward(const OpContext &ctx, const std::vector &in_data, + const std::vector &req, + const std::vector &out_data, + const std::vector &aux_args) { + using namespace mshadow; + using namespace mshadow::expr; + 
CHECK_EQ(in_data.size(), param_.use_sequence_length ? 2 : 1); + CHECK_EQ(out_data.size(), 1); + Stream *s = ctx.get_stream(); + + // Get any size input + output into required form + int max_seq_len = in_data[seq_reverse::kData].size(0); + int n = in_data[seq_reverse::kData].size(1); + int total_size = in_data[seq_reverse::kData].Size(); + int rest_dim = static_cast(total_size / n / max_seq_len); + + Shape<3> s3 = Shape3(max_seq_len, n, rest_dim); + Tensor data = + in_data[seq_reverse::kData].get_with_shape(s3, s); + Tensor out = + out_data[seq_reverse::kOut].get_with_shape(s3, s); + + // copy indices to vector + std::vector indices_vec(n, max_seq_len); + if (param_.use_sequence_length) + IndexTensorToVector( + in_data[seq_reverse::kSequenceLength].get(s), + &indices_vec); + + sequence_reverse(data, out, indices_vec, req[seq_reverse::kOut]); + } + + virtual void Backward(const OpContext &ctx, + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &req, + const std::vector &in_grad, + const std::vector &aux_args) { + using namespace mshadow; + using namespace mshadow::expr; + CHECK_EQ(out_grad.size(), 1); + CHECK_EQ(in_data.size(), param_.use_sequence_length ? 
2 : 1); + Stream *s = ctx.get_stream(); + + // Get any size input + output into required form + int max_seq_len = in_grad[seq_reverse::kData].size(0); + int n = in_grad[seq_reverse::kData].size(1); + int total_size = in_grad[seq_reverse::kData].Size(); + int rest_dim = static_cast(total_size / n / max_seq_len); + + Shape<3> s3 = Shape3(max_seq_len, n, rest_dim); + + Tensor data_grad = + in_grad[seq_reverse::kData].get_with_shape(s3, s); + Tensor output_grad = + out_grad[seq_reverse::kOut].get_with_shape(s3, s); + // copy indices to vector + std::vector indices_vec(n, max_seq_len); + if (param_.use_sequence_length) + IndexTensorToVector( + in_data[seq_reverse::kSequenceLength].get(s), + &indices_vec); + + sequence_reverse(output_grad, data_grad, indices_vec, + req[seq_reverse::kData]); + } + + private: + SequenceReverseParam param_; +}; // class SequenceReverseOp + +template +Operator *CreateOp(SequenceReverseParam param, int dtype); + +#if DMLC_USE_CXX11 +class SequenceReverseProp : public OperatorProperty { + public: + int NumVisibleOutputs() const override { return 1; } + + int NumOutputs() const override { return 1; } + + std::vector ListArguments() const override { + if (param_.use_sequence_length) + return {"data", "sequence_length"}; + else + return {"data"}; + } + + std::vector ListOutputs() const override { return {"output"}; } + + void Init(const std::vector > &kwargs) + override { + param_.Init(kwargs); + } + + std::map GetParams() const override { + return param_.__DICT__(); + } + + bool InferShape(std::vector *in_shape, std::vector *out_shape, + std::vector *aux_shape) const override { + using namespace mshadow; + CHECK_EQ(in_shape->size(), param_.use_sequence_length ? 
2 : 1) + << "Input:[data, sequence_length]"; + + const TShape &dshape = (*in_shape)[seq_reverse::kData]; + if (dshape.ndim() == 0) return false; + // seq length vector is same as batch size + if (param_.use_sequence_length) + SHAPE_ASSIGN_CHECK(*in_shape, seq_reverse::kSequenceLength, + Shape1(dshape[1])); + + const TShape &oshape = dshape; + out_shape->clear(); + out_shape->push_back(oshape); + return true; + } + + bool InferType(std::vector *in_type, std::vector *out_type, + std::vector *aux_type) const override { + CHECK_GE(in_type->size(), param_.use_sequence_length ? 2 : 1); + int dtype = (*in_type)[0]; + CHECK_NE(dtype, -1) << "First input must have specified type"; + for (index_t i = 0; i < in_type->size(); ++i) { + if ((*in_type)[i] == -1) { + (*in_type)[i] = dtype; + } else { + CHECK_EQ((*in_type)[i], dtype) << "This layer requires uniform type. " + << "Expected " << dtype << " v.s. given " + << (*in_type)[i] << " at " + << ListArguments()[i]; + } + } + out_type->clear(); + out_type->push_back(dtype); + return true; + } + + OperatorProperty *Copy() const override { + auto ptr = new SequenceReverseProp(); + ptr->param_ = param_; + return ptr; + } + + std::string TypeString() const override { return "SequenceReverse"; } + + std::vector DeclareBackwardDependency( + const std::vector &out_grad, const std::vector &in_data, + const std::vector &out_data) const override { + if (param_.use_sequence_length) + return {out_grad[seq_reverse::kOut], + in_data[seq_reverse::kSequenceLength]}; + else + return {out_grad[seq_reverse::kOut]}; + } + + std::vector BackwardResource( + const std::vector &in_shape) const override { + return {ResourceRequest::kTempSpace}; + } + + Operator *CreateOperator(Context ctx) const override { + LOG(FATAL) << "Not Implemented."; + return NULL; + } + + Operator *CreateOperatorEx(Context ctx, std::vector *in_shape, + std::vector *in_type) const override; + + private: + SequenceReverseParam param_; +}; // class SequenceReverseProp +#endif // 
DMLC_USE_CXX11 +} // namespace op +} // namespace mxnet +#endif // MXNET_OPERATOR_SEQUENCE_REVERSE_INL_H_ diff --git a/src/operator/sequence_reverse.cc b/src/operator/sequence_reverse.cc new file mode 100644 index 000000000000..590821743bbc --- /dev/null +++ b/src/operator/sequence_reverse.cc @@ -0,0 +1,49 @@ +/*! + * Copyright (c) 2015 by Contributors + * \file sequence_reverse.cc + * \brief + * \author Sebastian Bodenstein +*/ +#include "./sequence_reverse-inl.h" + +namespace mxnet { +namespace op { +template <> +Operator *CreateOp(SequenceReverseParam param, int dtype) { + Operator *op = NULL; + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, + { op = new SequenceReverseOp(param); }) + return op; +} + +// DO_BIND_DISPATCH comes from operator_common.h +Operator *SequenceReverseProp::CreateOperatorEx( + Context ctx, std::vector *in_shape, + std::vector *in_type) const { + std::vector out_shape, aux_shape; + std::vector out_type, aux_type; + CHECK(InferType(in_type, &out_type, &aux_type)); + CHECK(InferShape(in_shape, &out_shape, &aux_shape)); + DO_BIND_DISPATCH(CreateOp, param_, (*in_type)[0]); +} + +DMLC_REGISTER_PARAMETER(SequenceReverseParam); + +MXNET_REGISTER_OP_PROPERTY(SequenceReverse, SequenceReverseProp) + .describe( +"Reverses the elements of each sequence. Takes an n-dimensional tensor of the form " +"[max sequence length, batchsize, other dims] and returns a tensor of the same shape. " +"This operator takes an optional input tensor sequence_length of positive ints of dimension " +"[batchsize] when the sequence_length option is set to true. This allows the operator to " +"handle variable-length sequences. If sequence_length is false, then each example " +"in the batch is assumed to have the max sequence length." 
+) + .add_argument("data", "Symbol", + "n-dimensional input tensor of the form [max sequence " + "length, batchsize, other dims]") + .add_argument("sequence_length", "Symbol", + "vector of sequence lengths of size batchsize") + .add_arguments(SequenceReverseParam::__FIELDS__()); + +} // namespace op +} // namespace mxnet diff --git a/src/operator/sequence_reverse.cu b/src/operator/sequence_reverse.cu new file mode 100644 index 000000000000..cdd8f348950c --- /dev/null +++ b/src/operator/sequence_reverse.cu @@ -0,0 +1,21 @@ +/*! + * Copyright (c) 2015 by Contributors + * \file sequence_reverse.cu + * \brief + * \author Sebastian Bodenstein +*/ + +#include "./sequence_reverse-inl.h" + +namespace mxnet { +namespace op { +template <> Operator *CreateOp(SequenceReverseParam param, int dtype) { + Operator *op = NULL; + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + op = new SequenceReverseOp(param); + }) + return op; +} + +} // namespace op +} // namespace mxnet diff --git a/src/operator/softmax_activation-inl.h b/src/operator/softmax_activation-inl.h index db1834ead66b..222e62522d00 100644 --- a/src/operator/softmax_activation-inl.h +++ b/src/operator/softmax_activation-inl.h @@ -70,14 +70,15 @@ class SoftmaxActivationOp : public Operator { Tensor out = out_data[softmax_activation::kOut].FlatTo2D(s); Softmax(out, data); } else { - CHECK_EQ(in_data[softmax_activation::kData].ndim(), 4); - TShape src_shape = in_data[softmax_activation::kData].shape_; - Shape<3> dst_shape = Shape3(src_shape[0], src_shape[1], - src_shape[2] * src_shape[3]); - Tensor data = - in_data[softmax_activation::kData].get_with_shape(dst_shape, s); - Tensor out = - out_data[softmax_activation::kOut].get_with_shape(dst_shape, s); + CHECK_GE(in_data[softmax_activation::kData].ndim(), 3) + << "Input need to have a least 3 dimensions when mode=channel"; + int n = in_data[softmax_activation::kData].size(0); + int k = in_data[softmax_activation::kData].size(1); + Shape<3> s3 = Shape3(n, k, 
static_cast(in_data[softmax_activation::kData].Size()/n/k)); + Tensor data = + in_data[softmax_activation::kData].get_with_shape(s3, s); + Tensor out = + out_data[softmax_activation::kOut].get_with_shape(s3, s); Softmax(out, data); } } diff --git a/src/operator/softmax_activation.cc b/src/operator/softmax_activation.cc index 7ab486342414..6afcb296e9ed 100644 --- a/src/operator/softmax_activation.cc +++ b/src/operator/softmax_activation.cc @@ -23,9 +23,9 @@ DMLC_REGISTER_PARAMETER(SoftmaxActivationParam); MXNET_REGISTER_OP_PROPERTY(SoftmaxActivation, SoftmaxActivationProp) .describe("Apply softmax activation to input. This is intended for internal layers. " - "For output (loss layer) please use SoftmaxOutput. If type=instance, " + "For output (loss layer) please use SoftmaxOutput. If mode=instance, " "this operator will compute a softmax for each instance in the batch; " - "this is the default mode. If type=channel, this operator will compute " + "this is the default mode. If mode=channel, this operator will compute " "a num_channel-class softmax at each position of each instance; this can " "be used for fully convolutional network, image segmentation, etc.") .add_argument("data", "Symbol", "Input data to activation function.") diff --git a/src/operator/softmax_output-inl.h b/src/operator/softmax_output-inl.h index 876cf4491663..0c9d5c1819ab 100644 --- a/src/operator/softmax_output-inl.h +++ b/src/operator/softmax_output-inl.h @@ -162,6 +162,26 @@ class SoftmaxOutputOp : public Operator { Tensor o_grad = out_grad[softmaxout_enum::kOut].get_with_shape(s3, s); grad *= o_grad; + // cancel out previous normalization and normalize against valid grad count + // the s3[2] is the number of multi_output, total grad must be normalized by this or valid_cnt (valid multiple output * batch size) + index_t valid_grad = 0; + if (param_.normalization == softmaxout_enum::kBatch || param_.normalization == softmaxout_enum::kValid) { + Tensor workspace = 
ctx.requested[softmaxout_enum::kTempSpace].get_host_space_typed<3, DType>(o_grad.shape_); + Copy(workspace, o_grad, o_grad.stream_); + for (index_t i = 0; i < workspace.size(0); i++) { + for (index_t j = 0; j < workspace.size(1); j++) { + for (index_t k = 0; k < workspace.size(2); k++) { + if (workspace[i][j][k] > 0) { + valid_grad++; + } + } + } + } + valid_grad = valid_grad == 0 ? 1 : valid_grad; + } else { + valid_grad = 1; + } + grad *= DType(static_cast(param_.normalization == softmaxout_enum::kValid ? 1 : s3[2]) * static_cast(valid_cnt) / static_cast(valid_grad)); } } else { int n = out_data[softmaxout_enum::kOut].size(0); @@ -200,6 +220,23 @@ class SoftmaxOutputOp : public Operator { Tensor o_grad = out_grad[softmaxout_enum::kOut].get_with_shape(s2, s); grad *= o_grad; + // cancel out previous normalization and normalize against valid grad count + index_t valid_grad = 0; + if (param_.normalization == softmaxout_enum::kBatch || param_.normalization == softmaxout_enum::kValid) { + Tensor workspace = ctx.requested[softmaxout_enum::kTempSpace].get_host_space_typed<2, DType>(o_grad.shape_); + Copy(workspace, o_grad, o_grad.stream_); + for (index_t i = 0; i < workspace.size(0); i++) { + for (index_t j = 0; j < workspace.size(1); j++) { + if (workspace[i][j] > 0) { + valid_grad++; + } + } + } + valid_grad = valid_grad == 0 ? 
1 : valid_grad; + } else { + valid_grad = 1; + } + grad *= DType(static_cast(valid_cnt) / static_cast(valid_grad)); } } } diff --git a/src/operator/swapaxis-inl.h b/src/operator/swapaxis-inl.h index fe301d1d186e..28aa5acdf015 100644 --- a/src/operator/swapaxis-inl.h +++ b/src/operator/swapaxis-inl.h @@ -49,7 +49,7 @@ class SwapAxisOp : public Operator { } void Reshape2Five(mshadow::Shape<5> *inter_shape, - const mshadow::TShape &shape, + const TShape &shape, uint32_t dim1, uint32_t dim2) { using namespace mshadow; using namespace mshadow::expr; @@ -217,5 +217,3 @@ class SwapAxisProp : public OperatorProperty { } // namespace mxnet #endif // MXNET_OPERATOR_SWAPAXIS_INL_H_ - - diff --git a/src/operator/upsampling-inl.h b/src/operator/upsampling-inl.h index 743427b7a942..b392d3d7a568 100644 --- a/src/operator/upsampling-inl.h +++ b/src/operator/upsampling-inl.h @@ -39,7 +39,7 @@ struct UpSamplingParam : public dmlc::Parameter { .set_range(1, 1000) .describe("Up sampling scale"); DMLC_DECLARE_FIELD(num_filter) - .describe("Input filter. Only used by nearest sample_type.") + .describe("Input filter. 
Only used by bilinear sample_type.") .set_default(0); DMLC_DECLARE_FIELD(sample_type) .add_enum("nearest", up_enum::kNearest) @@ -205,7 +205,7 @@ class UpSamplingProp : public OperatorProperty { CHECK_EQ(oh%shape[2], 0) << "UpSamplingNearest: input height of " << shape[2] << \ "does not divide output height of " << oh; CHECK_EQ(ow%shape[3], 0) << "UpSamplingNearest: input weight of " << shape[3] << \ - "does not divide output weight of " << ow; + "does not divide output width of " << ow; if (param_.multi_input_mode == up_enum::kSum) { CHECK(oshape[1] == 0 || oshape[1] == shape[1]) << \ "Number of channels must be the same when multi_input_mode==sum"; @@ -217,7 +217,7 @@ class UpSamplingProp : public OperatorProperty { } else { CHECK_EQ(in_shape->size(), 2) << "Input:[data, weight]"; CHECK_EQ(dshape.ndim(), 4) << \ - "UpSamplingNearest: Input data should be 4D in (batch, channel, y, x)"; + "UpSamplingBilinear: Input data should be 4D in (batch, channel, y, x)"; if (dshape.ndim() == 0) return false; int kernel = 2 * param_.scale - param_.scale % 2; SHAPE_ASSIGN_CHECK(*in_shape, diff --git a/src/resource.cc b/src/resource.cc index bb1842ab83d1..9123c42c3a69 100644 --- a/src/resource.cc +++ b/src/resource.cc @@ -5,6 +5,7 @@ */ #include #include +#include #include #include #include @@ -24,11 +25,6 @@ struct SpaceAllocator { Storage::Handle handle; // internal CPU handle Storage::Handle host_handle; - // The old handles that need to be kept valid - // until release is called. - // This API allows several CUDA calls using - // temp space to get valid space until all the calls finished. 
- std::vector old_handles; SpaceAllocator() { handle.dptr = nullptr; @@ -36,37 +32,31 @@ struct SpaceAllocator { host_handle.dptr = nullptr; host_handle.size = 0; } - - inline void Release() { - for (const Storage::Handle& handle : old_handles) { - if (handle.size != 0) { - Storage::Get()->Free(handle); - } - } - old_handles.clear(); - } - inline void ReleaseAll() { - old_handles.push_back(handle); - old_handles.push_back(host_handle); - this->Release(); - handle.size = 0; - host_handle.size = 0; + if (handle.size != 0) { + Storage::Get()->DirectFree(handle); + handle.size = 0; + } + if (host_handle.size != 0) { + Storage::Get()->DirectFree(host_handle); + host_handle.size = 0; + } } - inline void* GetSpace(size_t size) { if (handle.size >= size) return handle.dptr; - old_handles.push_back(handle); - handle = Storage::Get()->Alloc( - std::max(size, handle.size * 2), ctx); + if (handle.size != 0) { + Storage::Get()->DirectFree(handle); + } + handle = Storage::Get()->Alloc(size, ctx); return handle.dptr; } inline void* GetHostSpace(size_t size) { if (host_handle.size >= size) return host_handle.dptr; - old_handles.push_back(host_handle); - host_handle = Storage::Get()->Alloc( - std::max(size, handle.size * 2), Context()); + if (handle.size != 0) { + Storage::Get()->DirectFree(host_handle); + } + host_handle = Storage::Get()->Alloc(size, Context()); return host_handle.dptr; } }; @@ -77,8 +67,8 @@ class ResourceManagerImpl : public ResourceManager { public: ResourceManagerImpl() noexcept(false) : global_seed_(0) { - cpu_temp_space_copy_ = dmlc::GetEnv("MXNET_CPU_TEMP_COPY", 16); - gpu_temp_space_copy_ = dmlc::GetEnv("MXNET_GPU_TEMP_COPY", 4); + cpu_temp_space_copy_ = dmlc::GetEnv("MXNET_CPU_TEMP_COPY", 4); + gpu_temp_space_copy_ = dmlc::GetEnv("MXNET_GPU_TEMP_COPY", 1); engine_ref_ = Engine::_GetSharedRef(); storage_ref_ = Storage::_GetSharedRef(); cpu_rand_.reset(new ResourceRandom( @@ -260,12 +250,8 @@ void* Resource::get_host_space_internal(size_t size) const { 
return static_cast(ptr_)->GetHostSpace(size); } -void Resource::release() const { - return static_cast(ptr_)->Release(); -} - ResourceManager* ResourceManager::Get() { - static resource::ResourceManagerImpl inst; - return &inst; + typedef dmlc::ThreadLocalStore inst; + return inst::Get(); } } // namespace mxnet diff --git a/src/storage/naive_storage_manager.h b/src/storage/naive_storage_manager.h index a476f5ea2acc..05a8b10c2bb1 100644 --- a/src/storage/naive_storage_manager.h +++ b/src/storage/naive_storage_manager.h @@ -29,6 +29,10 @@ class NaiveStorageManager final : public StorageManager { void* Alloc(size_t size) override; void Free(void* ptr, size_t) override; + void DirectFree(void* ptr, size_t size) override { + DeviceStorage::Free(ptr); + } + private: DISALLOW_COPY_AND_ASSIGN(NaiveStorageManager); }; // class NaiveStorageManager diff --git a/src/storage/pinned_memory_storage.h b/src/storage/pinned_memory_storage.h index 297ee0a5aa48..5b0df1041909 100644 --- a/src/storage/pinned_memory_storage.h +++ b/src/storage/pinned_memory_storage.h @@ -5,6 +5,7 @@ */ #ifndef MXNET_STORAGE_PINNED_MEMORY_STORAGE_H_ #define MXNET_STORAGE_PINNED_MEMORY_STORAGE_H_ +#if MXNET_USE_CUDA #include #include "mxnet/base.h" @@ -31,28 +32,21 @@ class PinnedMemoryStorage { inline void* PinnedMemoryStorage::Alloc(size_t size) { void* ret = nullptr; -#if MXNET_USE_CUDA // make the memory available across all devices CUDA_CALL(cudaHostAlloc(&ret, size, cudaHostAllocPortable)); -#else // MXNET_USE_CUDA - LOG(FATAL) << "Please compile with CUDA enabled"; -#endif // MXNET_USE_CUDA return ret; } inline void PinnedMemoryStorage::Free(void* ptr) { -#if MXNET_USE_CUDA cudaError_t err = cudaFreeHost(ptr); // ignore unloading error, as memory has already been recycled if (err != cudaSuccess && err != cudaErrorCudartUnloading) { LOG(FATAL) << "CUDA: " << cudaGetErrorString(err); } -#else // MXNET_USE_CUDA - LOG(FATAL) << "Please compile with CUDA enabled"; -#endif // MXNET_USE_CUDA } } // 
namespace storage } // namespace mxnet +#endif // MXNET_USE_CUDA #endif // MXNET_STORAGE_PINNED_MEMORY_STORAGE_H_ diff --git a/src/storage/pooled_storage_manager.h b/src/storage/pooled_storage_manager.h index 5fcf781a67f0..dbad12719cbf 100644 --- a/src/storage/pooled_storage_manager.h +++ b/src/storage/pooled_storage_manager.h @@ -6,61 +6,80 @@ #ifndef MXNET_STORAGE_POOLED_STORAGE_MANAGER_H_ #define MXNET_STORAGE_POOLED_STORAGE_MANAGER_H_ +#if MXNET_USE_CUDA + #include +#endif // MXNET_USE_CUDA #include #include #include #include #include #include "./storage_manager.h" +#include "../common/cuda_utils.h" + namespace mxnet { namespace storage { +#if MXNET_USE_CUDA /*! - * \brief Storage manager with a memory pool. + * \brief Storage manager with a memory pool on gpu. */ -template -class PooledStorageManager final : public StorageManager { +class GPUPooledStorageManager final : public StorageManager { public: /*! * \brief Default constructor. */ - PooledStorageManager() = default; + GPUPooledStorageManager() { + reserve_ = dmlc::GetEnv("MXNET_GPU_MEM_POOL_RESERVE", 5); + } /*! * \brief Default destructor. 
*/ - ~PooledStorageManager() { + ~GPUPooledStorageManager() { ReleaseAll(); } + void* Alloc(size_t size) override; void Free(void* ptr, size_t size) override; + void DirectFree(void* ptr, size_t size) override { + cudaError_t err = cudaFree(ptr); + // ignore unloading error, as memory has already been recycled + if (err != cudaSuccess && err != cudaErrorCudartUnloading) { + LOG(FATAL) << "CUDA: " << cudaGetErrorString(err); + } + used_memory_ -= size; + } + private: void ReleaseAll(); // internal mutex std::mutex mutex_; // used memory size_t used_memory_ = 0; + // percentage of reserved memory + int reserve_; // memory pool std::unordered_map> memory_pool_; - DISALLOW_COPY_AND_ASSIGN(PooledStorageManager); -}; // class PooledStorageManager + DISALLOW_COPY_AND_ASSIGN(GPUPooledStorageManager); +}; // class GPUPooledStorageManager -template -void* PooledStorageManager::Alloc(size_t size) { +void* GPUPooledStorageManager::Alloc(size_t size) { std::lock_guard lock(mutex_); auto&& reuse_it = memory_pool_.find(size); if (reuse_it == memory_pool_.end() || reuse_it->second.size() == 0) { - used_memory_ += size; - for (int i = 0; i < 2; ++i) { - try { - return DeviceStorage::Alloc(size); - } catch (const std::bad_alloc& e) { - ReleaseAll(); - } + size_t free, total; + cudaMemGetInfo(&free, &total); + if (size > free - total*reserve_/100) ReleaseAll(); + + void* ret = nullptr; + cudaError_t e = cudaMalloc(&ret, size); + if (e != cudaSuccess && e != cudaErrorCudartUnloading) { + LOG(FATAL) << "cudaMalloc failed: " << cudaGetErrorString(e); } - LOG(FATAL) << "Memory allocation failed."; - return NULL; + used_memory_ += size; + return ret; } else { auto&& reuse_pool = reuse_it->second; auto ret = reuse_pool.back(); @@ -69,23 +88,21 @@ void* PooledStorageManager::Alloc(size_t size) { } } -template -void PooledStorageManager::Free(void* ptr, size_t size) { +void GPUPooledStorageManager::Free(void* ptr, size_t size) { std::lock_guard lock(mutex_); auto&& reuse_pool = 
memory_pool_[size]; reuse_pool.push_back(ptr); } -template -void PooledStorageManager::ReleaseAll() { +void GPUPooledStorageManager::ReleaseAll() { for (auto&& i : memory_pool_) { for (auto&& j : i.second) { - DeviceStorage::Free(j); - used_memory_ -= i.first; + DirectFree(j, i.first); } } memory_pool_.clear(); } +#endif // MXNET_USE_CUDA } // namespace storage } // namespace mxnet diff --git a/src/storage/storage.cc b/src/storage/storage.cc index 79cc06c8dc0b..d80c64b4685c 100644 --- a/src/storage/storage.cc +++ b/src/storage/storage.cc @@ -21,6 +21,7 @@ class StorageImpl : public Storage { public: Handle Alloc(size_t size, Context ctx) override; void Free(Handle handle) override; + void DirectFree(Handle handle) override; StorageImpl() {} virtual ~StorageImpl() = default; @@ -28,10 +29,6 @@ class StorageImpl : public Storage { static constexpr size_t kMaxNumberOfDevices = Context::kMaxDevType + 1; static constexpr size_t kMaxNumberOfDeviceIDs = Context::kMaxDevID + 1; - template - using CurrentStorageManager = - storage::PooledStorageManager; - static void ActivateDevice(Context ctx) { switch (ctx.dev_type) { case Context::kCPU: break; @@ -63,15 +60,23 @@ Storage::Handle StorageImpl::Alloc(size_t size, Context ctx) { storage::StorageManager *ptr = nullptr; switch (ctx.dev_type) { case Context::kCPU: { - ptr = new CurrentStorageManager(); + ptr = new storage::NaiveStorageManager(); break; } case Context::kCPUPinned: { - ptr = new CurrentStorageManager(); +#if MXNET_USE_CUDA + ptr = new storage::NaiveStorageManager(); +#else + LOG(FATAL) << "Compile with USE_CUDA=1 to enable GPU usage"; +#endif // MXNET_USE_CUDA break; } case Context::kGPU: { - ptr = new CurrentStorageManager(); +#if MXNET_USE_CUDA + ptr = new storage::GPUPooledStorageManager(); +#else + LOG(FATAL) << "Compile with USE_CUDA=1 to enable GPU usage"; +#endif // MXNET_USE_CUDA break; } default: LOG(FATAL) << "Unimplemented device " << ctx.dev_type; @@ -95,6 +100,19 @@ void 
StorageImpl::Free(Storage::Handle handle) { manager->Free(handle.dptr, handle.size); } +void StorageImpl::DirectFree(Storage::Handle handle) { + const Context &ctx = handle.ctx; + auto&& device = storage_managers_.at(ctx.dev_type); + storage::StorageManager *manager = device.Get( + ctx.dev_id, []() { + LOG(FATAL) << "Cannot Free space to a device you have not allocated"; + return nullptr; + }); + this->ActivateDevice(ctx); + // directly free ths data. + manager->DirectFree(handle.dptr, handle.size); +} + std::shared_ptr Storage::_GetSharedRef() { #ifdef __MXNET_JS__ // dummy code needed for emscripten code to pass diff --git a/src/storage/storage_manager.h b/src/storage/storage_manager.h index 3d264ab278ca..de08688c5aac 100644 --- a/src/storage/storage_manager.h +++ b/src/storage/storage_manager.h @@ -28,6 +28,12 @@ class StorageManager { * \param size Size of the storage. */ virtual void Free(void* ptr, size_t size) = 0; + /*! + * \brief Direct de-allocation. + * \param ptr Pointer to deallocate. + * \param size Size of the storage. + */ + virtual void DirectFree(void* ptr, size_t size) = 0; /*! * \brief Destructor. 
*/ diff --git a/src/symbol/graph_executor.cc b/src/symbol/graph_executor.cc index 4ea774829e2c..46a94222905f 100644 --- a/src/symbol/graph_executor.cc +++ b/src/symbol/graph_executor.cc @@ -1081,7 +1081,7 @@ GraphExecutor::CreateCachedSegOpr(size_t topo_start, size_t topo_end) { auto rtop = read_vars.begin(); for (auto rit = read_vars.begin(); rit != read_vars.end(); ++rit) { while (wit != write_vars.end() && *wit < *rit) ++wit; - if (*wit != *rit) { + if (wit == write_vars.end() || *wit != *rit) { *rtop = *rit; ++rtop; } diff --git a/tests/cpp/storage_test.cc b/tests/cpp/storage_test.cc index 5b9c2300a249..66b125a7d7a6 100644 --- a/tests/cpp/storage_test.cc +++ b/tests/cpp/storage_test.cc @@ -34,3 +34,9 @@ TEST(Storage, Basic_GPU) { EXPECT_EQ(handle.dptr, ptr); } #endif // MXNET_USE_CUDA + +int main(int argc, char ** argv) { + testing::InitGoogleTest(&argc, argv); + testing::FLAGS_gtest_death_test_style = "threadsafe"; + return RUN_ALL_TESTS(); +} diff --git a/tests/cpp/threaded_engine_test.cc b/tests/cpp/threaded_engine_test.cc index 11a8b656b169..336bf3d8891b 100644 --- a/tests/cpp/threaded_engine_test.cc +++ b/tests/cpp/threaded_engine_test.cc @@ -229,3 +229,9 @@ TEST(Engine, basics) { oprs.clear(); LOG(INFO) << "All pass"; } + +int main(int argc, char ** argv) { + testing::InitGoogleTest(&argc, argv); + testing::FLAGS_gtest_death_test_style = "threadsafe"; + return RUN_ALL_TESTS(); +} diff --git a/tests/cpp/unittest.mk b/tests/cpp/unittest.mk index dff9e62941cd..6ac2b44fa46c 100644 --- a/tests/cpp/unittest.mk +++ b/tests/cpp/unittest.mk @@ -6,6 +6,6 @@ GTEST_INC=$(GTEST_PATH)/include/ tests/cpp/% : tests/cpp/%.cc lib/libmxnet.a $(CXX) -std=c++0x $(CFLAGS) -MM -MT tests/cpp/$* $< >tests/cpp/$*.d - $(CXX) -std=c++0x $(CFLAGS) -I$(GTEST_INC) -o $@ $(filter %.cc %.a, $^) $(LDFLAGS) -L$(GTEST_LIB) -lgtest -lgtest_main + $(CXX) -std=c++0x $(CFLAGS) -I$(GTEST_INC) -o $@ $(filter %.cc %.a, $^) $(LDFLAGS) -L$(GTEST_LIB) -lgtest -include tests/cpp/*.d diff --git 
a/tests/python/gpu/test_operator_gpu.py b/tests/python/gpu/test_operator_gpu.py index daa60e1779a0..5cd71e470f4f 100644 --- a/tests/python/gpu/test_operator_gpu.py +++ b/tests/python/gpu/test_operator_gpu.py @@ -5,87 +5,10 @@ from test_operator import * import mxnet as mx import numpy as np +from mxnet.test_utils import check_consistency from numpy.testing import assert_allclose import time -def check_consistency(sym, ctx_list, scale=1.0, grad_req='write'): - tol = {np.dtype(np.float16): 1e-1, - np.dtype(np.float32): 1e-3, - np.dtype(np.float64): 1e-5, - np.dtype(np.uint8): 0, - np.dtype(np.int32): 0} - assert(len(ctx_list) > 1) - exe_list = [sym.simple_bind(grad_req=grad_req, **ctx) for ctx in ctx_list] - for exe in exe_list: - assert(len(exe.outputs) == 1) - assert(len(exe.arg_arrays) == len(exe_list[0].arg_arrays)) - assert(len(exe.grad_arrays) == len(exe_list[0].grad_arrays)) - - init = [np.random.normal(size=arr.shape, scale=scale) for arr in exe_list[0].arg_arrays] - if sym.name == 'embedding': - init[0] = np.random.randint(low=0, high=10, size=exe_list[0].arg_arrays[0].shape) - - for exe in exe_list: - for arr, iarr in zip(exe.arg_arrays, init): - arr[:] = iarr.astype(arr.dtype) - - # forward - for exe in exe_list: - exe.forward(is_train=True) - exe.backward(exe.outputs[0]) - - outputs = [exe.outputs[0].asnumpy() for exe in exe_list] - # lazy solution handling None grad - grads = [[grad.asnumpy() if grad is not None else np.zeros(1) for grad in exe.grad_arrays] for exe in exe_list] - dtypes = [arr.dtype for arr in outputs] - max_idx = np.argmax(dtypes) - - for i, exe in enumerate(exe_list): - if i == max_idx: - continue - for arr1, arr2 in zip([outputs[i]]+grads[i], [outputs[max_idx]]+grads[max_idx]): - arr2 = arr2.astype(dtypes[i]) - try: - assert_allclose(arr1, arr2, rtol=tol[dtypes[i]], atol=tol[dtypes[i]]) - except Exception, e: - print e - - #forward predict - for exe in exe_list: - exe.forward(is_train=False) - - outputs = [exe.outputs[0].asnumpy() for 
exe in exe_list] - dtypes = [arr.dtype for arr in outputs] - max_idx = np.argmax(dtypes) - - for i, exe in enumerate(exe_list): - if i == max_idx: - continue - for arr1, arr2 in zip([outputs[i]], [outputs[max_idx]]): - arr2 = arr2.astype(dtypes[i]) - try: - assert_allclose(arr1, arr2, rtol=tol[dtypes[i]], atol=tol[dtypes[i]]) - except Exception, e: - print e - -def check_speed(sym, ctx, scale=1.0, N=100, grad_req='write'): - exe = sym.simple_bind(grad_req=grad_req, **ctx) - init = [np.random.normal(size=arr.shape, scale=scale) for arr in exe.arg_arrays] - for arr, iarr in zip(exe.arg_arrays, init): - arr[:] = iarr.astype(arr.dtype) - - # warm up - exe.forward(is_train=True) - exe.backward(exe.outputs[0]) - exe.outputs[0].wait_to_read() - - tic = time.time() - for i in range(N): - exe.forward(is_train=True) - exe.backward(exe.outputs[0]) - exe.outputs[0].wait_to_read() - return (time.time() - tic)*1.0/N - def test_batchnorm_with_type(): sym = mx.sym.BatchNorm(name='norm', fix_gamma=False) ctx_list = [{'ctx': mx.gpu(0), 'norm_data': (10, 2, 10, 10), 'type_dict': {'norm_data': np.float32}}, diff --git a/tests/python/unittest/check_utils.py b/tests/python/unittest/check_utils.py deleted file mode 100644 index ff036bd29550..000000000000 --- a/tests/python/unittest/check_utils.py +++ /dev/null @@ -1,257 +0,0 @@ -import mxnet as mx -from mxnet.operator import NumpyOp - -import numpy as np - -def _np_reduce(dat, axis, keepdims, numpy_reduce_func): - if isinstance(axis, int): - axis = [axis] - else: - axis = list(axis) if axis is not None else range(len(dat.shape)) - ret = dat - for i in reversed(sorted(axis)): - ret = numpy_reduce_func(ret, axis=i) - if keepdims: - keepdims_shape = list(dat.shape) - for i in axis: - keepdims_shape[i] = 1 - ret = ret.reshape(tuple(keepdims_shape)) - return ret - -def reldiff(a, b): - diff = np.sum(np.abs(a - b)) - norm = np.sum(np.abs(a)) - if diff == 0: - return 0 - reldiff = diff / norm - return reldiff - -class SumAllLoss(NumpyOp): - """ 
- Operator to sum all elements in a tensor. - """ - def __init__(self): - super(SumAllLoss, self).__init__(False) - def list_arguments(self): - return ['data'] - def list_outputs(self): - return ['output'] - def infer_shape(self, in_shape): - return in_shape, [(1,)] - def forward(self, in_data, out_data): - out_data[0][:] = np.sum(in_data[0]) - def backward(self, out_grad, in_data, out_data, in_grad): - in_grad[0][:] = 1 - -def numeric_grad(executor, location, eps=1e-4): - """ Class based on Theano's `theano.gradient.numeric_grad` [1] - Calculates a numeric gradient via finite difference method. - - Parameters: - ----------- - executor: `mxnet.executor.Executor` - exectutor that computes the forward pass - - location: list np.ndarray - location in which to compute gradient. list should be the same size - as executor.arg_arrays - - References - --------- - ..[1] https://github.com/Theano/Theano/blob/master/theano/gradient.py - """ - args = executor.arg_arrays - for a, l in zip(args, location): - a[:] = np.asarray(l) - approx_grads = [np.zeros_like(l) for l in location] - - executor.forward(is_train=True) - f_x = executor.outputs[0].asnumpy() - - x_copy = [np.copy(x) for x in location] - for ap_grad, loc, reset in zip(approx_grads, location, x_copy): - for i in range(np.prod(loc.shape)): - # inplace update of memory - loc.ravel()[i] += eps - - # set initial states. Need to set all due to inplace operations - for inp, val in zip(args, location): - inp[:] = val - executor.forward(is_train=True) - f_eps = executor.outputs[0].asnumpy() - ap_grad.ravel()[i] = (f_eps - f_x) / eps - loc.ravel()[i] = reset.ravel()[i] - - return approx_grads - - -rng = np.random.RandomState(1234) - -def check_numeric_gradient(sym, location, aux_states=[], numeric_eps=1e-4, check_eps=1e-2): - """ - Verify an operation by checking backwards pass via - finite difference method. 
- - Based on Theano's `theano.gradient.numeric_grad` [1] - - Parameters: - ----------- - sym: `mxnet.symbol.Symbol` - Symbol containing op to test - location: list of numpy.ndarray - list of numpy.ndarray used as location to compute gradient - numeric_eps: float, optional - delta for location to compute numeric gradient - check_eps: float, optional - relative error eps used when comparing numeric grad to symbolic grad - - References - --------- - ..[1] https://github.com/Theano/Theano/blob/master/theano/gradient.py - """ - - # random_projection should not have elements too small, - # otherwise too much precision is lost in numerical gradient - def random_projection(shape): - plain = rng.rand(*shape) + 0.1 - #plain = np.ones(shape) - return plain - - kwargs = {name:array.shape for name, array in zip(sym.list_arguments(), location)} - arg_shape, out_shape, aux_shape = sym.infer_shape(**kwargs) - - proj = mx.sym.Variable("__random_proj") - out = SumAllLoss()(sym*proj) - - args = out.list_arguments() - - kwargs = {a:loc.shape for a,loc in zip(args, location)} - - arr_data = [mx.nd.array(l) for l in location] + [mx.nd.empty(out_shape[0])] - arr_grad = [mx.nd.empty(l.shape) for l in location] + [mx.nd.empty(out_shape[0])] - arr_aux = [mx.nd.array(l) for l in aux_states] - - executor = out.bind(mx.cpu(), args=arr_data, args_grad=arr_grad, aux_states=arr_aux) - - location = location + [random_projection(out_shape[0])] - inps = executor.arg_arrays - if len(inps) != len(location): - raise ValueError("Executor arg_arrays and and location len do not match." 
- "Got %d inputs and %d locations"%(len(inps), len(location)) - ) - for inp, source in zip(location, executor.arg_arrays): - source[:] = inp - - for g in executor.grad_arrays: - if g: - g[:] = 0 - - assert len(executor.outputs) == 1 - - executor.forward(is_train=True) - executor.backward() - # remove the proj from grads - symbolic_grad = [g.asnumpy() for g in executor.grad_arrays[0:-1]] - - # refactor forward out of here as this no longer computes correct forward pass - numeric_gradients = numeric_grad(executor, location, eps=numeric_eps) - - for name, numeric, symbolic in zip(out.list_arguments(), numeric_gradients, symbolic_grad): - rel = reldiff(numeric, symbolic) - if rel > check_eps: - raise Exception("Numeric check failed for %s. relative error of %f expected <= %f"%(name, rel, check_eps)) - -def check_symbolic_forward(sym, location, expected, check_eps=1e-5): - """ Compare foward call to expected value. - - Parameters - --------- - sym: mxnet.symbol.Symbol - output symbol - location: list np.ndarray - list of numpy arrays corresponding to sym.list_arguments - expected: list np.ndarray - list of arrays corresponding to sym.outputs - check_eps: float - relative error to check to - """ - kwargs = {name:array.shape for name, array in zip(sym.list_arguments(), location)} - arg_shape, out_shape, aux_shape = sym.infer_shape(**kwargs) - - args = sym.list_arguments() - - kwargs = {a:loc.shape for a,loc in zip(args, location)} - - arr_data = [mx.nd.array(l) for l in location] - arr_grad = [mx.nd.empty(l.shape) for l in location] - - executor = sym.bind(mx.cpu(), args=arr_data, args_grad=arr_grad) - - inps = executor.arg_arrays - if len(inps) != len(location): - raise ValueError("Executor arg_arrays and and location len do not match." 
- "Got %d inputs and %d locations"%(len(inps), len(location)) - ) - for inp, source in zip(location, executor.arg_arrays): - source[:] = inp - - for g in executor.grad_arrays: - if g: - g[:] = 0 - - assert len(executor.outputs) == 1 - - executor.forward() - - outputs = [x.asnumpy() for x in executor.outputs] - for expect, output in zip(expected, outputs): - assert reldiff(expect, output) <= check_eps - -def check_symbolic_backward(sym, location, out_grad, expected, check_eps=1e-5): - """ Compare backwards call to expected value. - - Parameters - --------- - sym: mxnet.symbol.Symbol - output symbol - location: list np.ndarray - list of numpy arrays corresponding to sym.list_arguments - location: list np.ndarray - list of numpy arrays corresponding to sym.outputs for incomming gradient - expected: list np.ndarray - list of arrays corresponding to sym.outputs - check_eps: float - relative error to check to - """ - - kwargs = {name:array.shape for name, array in zip(sym.list_arguments(), location)} - arg_shape, out_shape, aux_shape = sym.infer_shape(**kwargs) - - args = sym.list_arguments() - - kwargs = {a:loc.shape for a,loc in zip(args, location)} - - arr_data = [mx.nd.array(l) for l in location] - arr_grad = [mx.nd.empty(l.shape) for l in location] - out_grad = [mx.nd.array(j) for j in out_grad] - - executor = sym.bind(mx.cpu(), args=arr_data, args_grad=arr_grad) - - inps = executor.arg_arrays - if len(inps) != len(location): - raise ValueError("Executor arg_arrays and and location len do not match." 
- "Got %d inputs and %d locations"%(len(inps), len(location)) - ) - for inp, source in zip(location, executor.arg_arrays): - source[:] = inp - - for g in executor.grad_arrays: - if g: - g[:] = 0 - - executor.forward() - executor.backward(out_grad) - - grads = [x.asnumpy() for x in executor.grad_arrays] - for expect, grad in zip(expected, grads): - assert reldiff(expect, grad) <= check_eps diff --git a/tests/python/unittest/test_ndarray.py b/tests/python/unittest/test_ndarray.py index 45537ff7540b..3bc5999a5ee7 100644 --- a/tests/python/unittest/test_ndarray.py +++ b/tests/python/unittest/test_ndarray.py @@ -2,17 +2,7 @@ import mxnet as mx import numpy as np import pickle as pkl -from check_utils import _np_reduce - -def reldiff(a, b): - diff = np.sum(np.abs(a - b)) - norm = np.sum(np.abs(a)) - reldiff = diff / (norm + 1e-8) - return reldiff - - -def same(a, b): - return np.sum(a != b) == 0 +from mxnet.test_utils import * def check_with_uniform(uf, arg_shapes, dim=None, npuf=None, rmin=-10, type_list=[np.float32]): @@ -186,6 +176,20 @@ def test_ndarray_slice(): A[3:8] = A2[3:8] assert same(A[3:8].asnumpy(), A2[3:8]) + +def test_ndarray_slice_along_axis(): + arr = mx.nd.array(np.random.uniform(-10, 10, (3, 4, 2, 3))) + sub_arr = mx.nd.zeros((3, 2, 2, 3)) + arr._copy_slice_to(1, 1, 3, sub_arr) + + # test we sliced correctly + assert same(arr.asnumpy()[:, 1:3, :, :], sub_arr.asnumpy()) + + # test that slice is copy, instead of shared memory + sub_arr[:] = 0 + assert not same(arr.asnumpy()[:, 1:3, :, :], sub_arr.asnumpy()) + + def test_clip(): shape = (10,) A = mx.random.uniform(-10, 10, shape) @@ -231,11 +235,11 @@ def test_reduce_inner(numpy_reduce_func, nd_reduce_func): %(ndarray_ret.shape, numpy_ret.shape) err = np.square(ndarray_ret - numpy_ret).mean() assert err < 1E-4 - test_reduce_inner(lambda data, axis, keepdims:_np_reduce(data, axis, keepdims, np.sum), + test_reduce_inner(lambda data, axis, keepdims:np_reduce(data, axis, keepdims, np.sum), mx.nd.sum) - 
test_reduce_inner(lambda data, axis, keepdims:_np_reduce(data, axis, keepdims, np.max), + test_reduce_inner(lambda data, axis, keepdims:np_reduce(data, axis, keepdims, np.max), mx.nd.max) - test_reduce_inner(lambda data, axis, keepdims:_np_reduce(data, axis, keepdims, np.min), + test_reduce_inner(lambda data, axis, keepdims:np_reduce(data, axis, keepdims, np.min), mx.nd.min) def test_broadcast(): @@ -261,6 +265,7 @@ def test_broadcast_to(): test_broadcast_to() if __name__ == '__main__': + test_ndarray_slice_along_axis() test_ndarray_slice() test_ndarray_pickle() test_ndarray_saveload() diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index a9fc189cc727..8119b349d758 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -3,12 +3,9 @@ import mxnet as mx import random from numpy.testing import assert_allclose -from check_utils import (check_numeric_gradient, check_symbolic_backward, - check_symbolic_forward, reldiff, _np_reduce) +from mxnet.test_utils import * -def same(a, b): - return np.sum(a != b) == 0 def np_softmax(x): x = x - np.max(x, axis=1).reshape(x.shape[0], 1) @@ -727,8 +724,10 @@ def test_batchnorm_training(): data = mx.symbol.Variable('data') test = mx.symbol.BatchNorm(data, fix_gamma=False) + check_numeric_gradient(test, [data_tmp, gamma, beta], [rolling_mean, rolling_std], numeric_eps=1e-3, check_eps=0.16) - check_numeric_gradient(test, [data_tmp, gamma, beta], [rolling_mean, rolling_std], numeric_eps=1e-3, check_eps=5e-2) + test = mx.symbol.BatchNorm(data, fix_gamma=False, use_global_stats=True) + check_numeric_gradient(test, [data_tmp, gamma, beta], [rolling_mean, rolling_std], numeric_eps=1e-3, check_eps=0.16) def test_convolution_grouping(): num_filter = 4 @@ -926,38 +925,44 @@ def test_convolution_dilated_impulse_response(): def test_reshape(): - def test_reshape_new(src_shape, shape_args, dst_shape): + def test_reshape_new(src_shape, shape_args, reverse, 
dst_shape): net = mx.sym.Variable("data") - net = mx.sym.Reshape(net, shape=shape_args) + net = mx.sym.Reshape(net, shape=shape_args, reverse=reverse) js = net.tojson() net = mx.sym.load_json(js) _, output_shape, __ = net.infer_shape(data=src_shape) assert output_shape[0] == dst_shape, \ - 'Src Shape = %s, Shape Arguments = %s, Dst Shape = %s, Output Shape = %s' \ - %(str(src_shape), str(shape_args), str(dst_shape), str(output_shape[0])) + 'Src Shape = %s, Shape Arguments = %s, Reverse = %s, Dst Shape = %s, ' \ + 'Output Shape = %s' %(str(src_shape), str(shape_args), str(reverse), + str(dst_shape), str(output_shape[0])) dat_npy = np.random.rand(*src_shape) grad_npy = np.random.rand(*dst_shape) exe = net.simple_bind(mx.cpu(), data=src_shape) exe.arg_dict['data'][:] = dat_npy exe.forward(is_train=True) assert np.square(exe.outputs[0].asnumpy() - dat_npy.reshape(dst_shape)).mean() < 1E-7, \ - 'Src Shape = %s, Shape Arguments = %s, Dst Shape = %s' %(str(src_shape), - str(shape_args), str(dst_shape)) + 'Src Shape = %s, Shape Arguments = %s, Reverse = %s, Dst Shape = %s'\ + %(str(src_shape), str(shape_args), str(reverse), str(dst_shape)) exe.backward(out_grads=mx.nd.array(grad_npy)) assert np.square(exe.grad_dict['data'].asnumpy() - grad_npy.reshape(src_shape)).mean() < 1E-7, \ - 'Src Shape = %s, Shape Arguments = %s, Dst Shape = %s' %(str(src_shape), - str(shape_args), str(dst_shape)) + 'Src Shape = %s, Shape Arguments = %s, Reverse = %s, Dst Shape = %s'\ + %(str(src_shape), str(shape_args), str(reverse), str(dst_shape)) # Test new api (Using shape) - test_cases = [[(2, 3, 5, 5), (0, -1), (2, 75)], - [(2, 3, 5, 5), (0, 0, -1), (2, 3, 25)], - [(5, 3, 4, 5), (0, -1, 0), (5, 15, 4)], - [(2, 3, 5, 4), (-1, 0, 0), (8, 3, 5), - [(2, 3, 4, 5), (3, -1, 0), (3, 10, 4)], - [(2, 3, 5, 5), (5, 3, 0, -1), (5, 3, 5, 2)], - [(2, 3, 5, 5), (0, 0, 0, 0), (2, 3, 5, 5)], - [(2, 4, 5, 3), (-1, 2, 2, 1), (30, 2, 2, 1)]]] + test_cases = [[(2, 3, 5, 5), (0, -1), False, (2, 75)], + [(2, 3, 5, 
5), (0, 0, -1), False, (2, 3, 25)], + [(5, 3, 4, 5), (0, -1, 0), False, (5, 15, 4)], + [(2, 3, 5, 4), (-1, 0, 0), False, (8, 3, 5)], + [(2, 3, 5, 5), (0, 0, 0, 0), False, (2, 3, 5, 5)], + [(2, 4, 5, 3), (-1, 2, 2, 1), False, (30, 2, 2, 1)], + [(2, 3, 5, 5), (0, -1), True, (5, 30)], + [(2, 3, 5, 5), (0, 0, -1), True, (3, 5, 10)], + [(5, 3, 4, 5), (0, -1, 0), True, (3, 20, 5)], + [(2, 3, 5, 4), (-1, 0, 0), True, (6, 5, 4)], + [(2, 3, 4, 5), (3, -1, 0), True, (3, 8, 5)], + [(2, 3, 5, 5), (5, 3, 0, -1), True, (5, 3, 5, 2)], + [(2, 3, 5, 5), (0, 0, 0, 0), True, (2, 3, 5, 5)]] for test_case in test_cases: - test_reshape_new(test_case[0], test_case[1], test_case[2]) + test_reshape_new(*test_case) # Test old api net = mx.sym.Variable("data") net = mx.sym.Reshape(net, target_shape=(2, 0)) @@ -1008,9 +1013,9 @@ def test_reduce_inner(numpy_reduce_func, numpy_reduce_grad_func, mx_reduce_sym): net.backward(out_grads=mx.nd.array(outgrad_npy)) err_backward = reldiff(grad_nd.asnumpy(), grad_groundtruth) assert err_backward < 1E-4 - test_reduce_inner(lambda data, axis, keepdims:_np_reduce(data, axis, keepdims, np.sum), + test_reduce_inner(lambda data, axis, keepdims:np_reduce(data, axis, keepdims, np.sum), lambda outgrad, data, axis, keepdims: - outgrad.reshape(_np_reduce(data, axis, 1, np.sum).shape), + outgrad.reshape(np_reduce(data, axis, 1, np.sum).shape), mx.symbol.sum) def test_broadcast(): @@ -1032,7 +1037,7 @@ def test_broadcasting_ele(sym_bcast): groundtruth = dat_npy grad_nd = mx.nd.empty(shape) outgrad_npy = np.random.rand(*target_shape) - grad_groundtruth = _np_reduce(outgrad_npy, axis=axis, keepdims=True, + grad_groundtruth = np_reduce(outgrad_npy, axis=axis, keepdims=True, numpy_reduce_func=np.sum) net = sym_bcast.bind(mx.cpu(), args={'a': mx.nd.array(dat_npy)}, args_grad={'a': grad_nd}) @@ -1131,7 +1136,6 @@ def test_flip(): def test_stn(): - import pdb np.set_printoptions(threshold=np.nan) num_filter = 2 # conv of loc net kernel = (3, 3) # conv of loc net @@ -1437,6 
+1441,18 @@ def test_support_vector_machine_l2_svm(): grad_np = grad_np.astype(np.float32) assert_allclose(grad_np, grad.asnumpy()) +def test_roipooling(): + data = mx.symbol.Variable(name='data') + rois = mx.symbol.Variable(name='rois') + test = mx.symbol.ROIPooling(data=data, rois=rois, pooled_size=(6, 6), spatial_scale=1) + + x1 = np.random.rand(4, 3, 12, 8) + x2 = np.array([[0, 1, 1, 6, 6], [2, 6, 2, 7, 11], [1, 3, 1, 5, 10], [0, 3, 3, 3, 3]]) + + check_numeric_gradient(test, [x1, x2], numeric_eps=1e-3, check_eps=1e-2) + check_numeric_gradient(sym=test, location=[x1, x2], + grad_nodes={'data':'add', 'rois':'write'}, + numeric_eps=1e-3, check_eps=1e-2) if __name__ == '__main__': test_expand_dims() @@ -1478,3 +1494,4 @@ def test_support_vector_machine_l2_svm(): test_correlation() test_support_vector_machine_l1_svm() test_support_vector_machine_l2_svm() + test_roipooling() diff --git a/tests/python/unittest/test_recordio.py b/tests/python/unittest/test_recordio.py new file mode 100644 index 000000000000..a3853ee891c2 --- /dev/null +++ b/tests/python/unittest/test_recordio.py @@ -0,0 +1,71 @@ +# pylint: skip-file +import sys +import mxnet as mx +import numpy as np +import tempfile +import random +import string + +def test_recordio(): + frec = tempfile.mktemp() + N = 255 + + writer = mx.recordio.MXRecordIO(frec, 'w') + for i in range(N): + if sys.version_info[0] < 3: + writer.write(str(chr(i))) + else: + writer.write(bytes(str(chr(i)), 'utf-8')) + del writer + + reader = mx.recordio.MXRecordIO(frec, 'r') + for i in range(N): + res = reader.read() + if sys.version_info[0] < 3: + assert res == str(chr(i)) + else: + assert res == bytes(str(chr(i)), 'utf-8') + +def test_indexed_recordio(): + fidx = tempfile.mktemp() + frec = tempfile.mktemp() + N = 255 + + writer = mx.recordio.MXIndexedRecordIO(fidx, frec, 'w') + for i in range(N): + if sys.version_info[0] < 3: + writer.write_idx(i, str(chr(i))) + else: + writer.write_idx(i, bytes(str(chr(i)), 'utf-8')) + del writer + + 
reader = mx.recordio.MXIndexedRecordIO(fidx, frec, 'r') + keys = reader.keys() + assert sorted(keys) == [i for i in range(N)] + random.shuffle(keys) + for i in keys: + res = reader.read_idx(i) + if sys.version_info[0] < 3: + assert res == str(chr(i)) + else: + assert res == bytes(str(chr(i)), 'utf-8') + +def test_recordio_pack_label(): + frec = tempfile.mktemp() + N = 255 + + for i in range(1, N): + for j in range(N): + content = ''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(j)) + content = content.encode('utf-8') + label = np.random.uniform(size=i).astype(np.float32) + header = (0, label, 0, 0) + s = mx.recordio.pack(header, content) + rheader, rcontent = mx.recordio.unpack(s) + assert (label == rheader.label).all() + assert content == rcontent + +if __name__ == '__main__': + test_recordio_pack_label() + test_recordio() + test_indexed_recordio() \ No newline at end of file diff --git a/tests/travis/run_test.sh b/tests/travis/run_test.sh index edbc2f99ebaa..fa7d8c5a7e22 100755 --- a/tests/travis/run_test.sh +++ b/tests/travis/run_test.sh @@ -111,6 +111,18 @@ if [ ${TASK} == "python_test" ]; then exit 0 fi +if [ ${TASK} == "julia" ]; then + make all || exit -1 + # use cached dir for storing data + rm -rf ${PWD}/data + mkdir -p ${CACHE_PREFIX}/data + ln -s ${CACHE_PREFIX}/data ${PWD}/data + + export MXNET_HOME="${PWD}" + julia -e 'Pkg.clone("MXNet"); Pkg.checkout("MXNet"); Pkg.build("MXNet"); Pkg.test("MXNet")' || exit -1 + exit 0 +fi + if [ ${TASK} == "scala_test" ]; then if [ ${TRAVIS_OS_NAME} == "osx" ]; then LIB_GOMP_PATH=`find /usr/local/lib -name libgomp.dylib | grep -v i386 | head -n1` diff --git a/tools/bandwidth/.gitignore b/tools/bandwidth/.gitignore new file mode 100644 index 000000000000..d2597a8d27b2 --- /dev/null +++ b/tools/bandwidth/.gitignore @@ -0,0 +1 @@ +ResNet diff --git a/tools/bandwidth/README.md b/tools/bandwidth/README.md new file mode 100644 index 000000000000..c57177e612bc --- /dev/null +++ 
b/tools/bandwidth/README.md @@ -0,0 +1,124 @@ +# Measure communication bandwidth + +MXNet provides multiple ways to communicate data. The best choice depends on +both the physical machines and neural network structure. This folder provides +tools to test the bandwidth under various setups, which can be used to debug +the performance. + +## Usages + +`measure.py` provides several options. We list some important ones, try `python +measure.py --help` for more details. + +- `--gpus` the list of gpus to test. `0,3` means GPUs 0 and 3. +- `--network` the neural network to test, such as resnet, alexnet, inception-bn, and vgg +- `--kvstore` the way how data is communicated. + - `local` : copy data from GPU to CPU, run optimizer on CPU + - `device` (default) : communicate (reduce and broadcast) data on GPU, + use GPU peer-to-peer communication if supported. The optimizer will run on + GPUs. + - `dist_sync` : similar to `local`, but the data is further sent to parameter + servers, and run the optimizer on servers + - `dist_sync_device` : similar to `dist_sync` but try best to use GPU for communication + - `dist_async` : similar to `dist_sync` but uses asynchronous communication + - `dist_async_device` : similar to `dist_async` but try best to use GPU for communication + +## Samples + +### Single machine with multiple GPUs + +- Use resnet 200 layers on GPU 0, 1, 2, and 3 + +```bash +~/mxnet/tools/bandwidth $ python measure.py --kv-store device --gpus 0,1 --network resnet --depth 200 +INFO:root:Namespace(batch_size=128, data_shape='128,3,224,224', depth=200, disp_batches=1, gpus='0,1', kv_store='device', network='resnet', num_batches=5, num_classes=1000, optimizer='None', test_results=1) +INFO:root:num of arrays = 205, total size = 257.991328 MB +INFO:root:iter 1, 0.023242 sec, 11.100222 GB/sec per gpu, error 0.000000 +INFO:root:iter 2, 0.023106 sec, 11.165508 GB/sec per gpu, error 0.000000 +INFO:root:iter 3, 0.023218 sec, 11.111735 GB/sec per gpu, error 0.000000
+INFO:root:iter 4, 0.023193 sec, 11.123614 GB/sec per gpu, error 0.000000 +INFO:root:iter 5, 0.023089 sec, 11.173694 GB/sec per gpu, error 0.000000 +``` + +The results are close to the unidirectional bandwidth, which is 13 GB/sec, reported by +`cuda/samples/1_Utilities/p2pBandwidthLatencyTest`. But our problem is harder +because we do all-to-all communication. + +- Use 8 GPUs, it saturates the single 16x link between GPU 0,1,2,3 and GPU 4,5,6,7. + +```bash +~/mxnet/tools/bandwidth $ python measure.py --kv-store device --gpus 0,1,2,3,4,5,6,7 --network resnet --depth 200 +INFO:root:Namespace(batch_size=128, data_shape='128,3,224,224', depth=200, disp_batches=1, gpus='0,1,2,3,4,5,6,7', kv_store='device', network='resnet', num_batches=5, num_classes=1000, optimizer='None', test_results=1) +INFO:root:num of arrays = 205, total size = 257.991328 MB +INFO:root:iter 1, 0.102321 sec, 4.412429 GB/sec per gpu, error 0.000000 +INFO:root:iter 2, 0.100345 sec, 4.499330 GB/sec per gpu, error 0.000000 +INFO:root:iter 3, 0.097317 sec, 4.639322 GB/sec per gpu, error 0.000000 +INFO:root:iter 4, 0.099873 sec, 4.520586 GB/sec per gpu, error 0.000000 +INFO:root:iter 5, 0.100774 sec, 4.480169 GB/sec per gpu, error 0.000000 +``` + +- Now let's only use GPU-CPU communication, it saturates the single 16x link +between all GPUs and the CPU.
+ +```bash +~/mxnet/tools/bandwidth $ python measure.py --kv-store local --gpus 0,1,2,3,4,5,6,7 --network resnet --depth 200 +INFO:root:Namespace(batch_size=128, data_shape='128,3,224,224', depth=200, disp_batches=1, gpus='0,1,2,3,4,5,6,7', kv_store='local', network='resnet', num_batches=5, num_classes=1000, optimizer='None', test_results=1) +INFO:root:num of arrays = 205, total size = 257.991328 MB +INFO:root:iter 1, 0.290164 sec, 1.555964 GB/sec per gpu, error 0.000000 +INFO:root:iter 2, 0.293963 sec, 1.535856 GB/sec per gpu, error 0.000000 +INFO:root:iter 3, 0.294468 sec, 1.533222 GB/sec per gpu, error 0.000000 +INFO:root:iter 4, 0.290657 sec, 1.553325 GB/sec per gpu, error 0.000000 +INFO:root:iter 5, 0.290799 sec, 1.552567 GB/sec per gpu, error 0.000000 +``` + +- Finally we change to VGG and also run the `sgd` optimizer + +```bash +~/mxnet/tools/bandwidth $ python measure.py --kv-store device --gpus 0,1,2,3,4,5,6,7 --network vgg --optimizer sgd +INFO:root:Namespace(batch_size=128, data_shape='128,3,224,224', depth=152, disp_batches=1, gpus='0,1,2,3,4,5,6,7', kv_store='device', network='vgg', num_batches=5, num_classes=1000, optimizer='sgd', test_results=1) +INFO:root:num of arrays = 22, total size = 531.453344 MB +INFO:root:iter 1, 0.525208 sec, 1.770810 GB/sec per gpu, error 0.000000 +INFO:root:iter 2, 0.524052 sec, 1.774715 GB/sec per gpu, error 0.000000 +INFO:root:iter 3, 0.524732 sec, 1.772416 GB/sec per gpu, error 0.000000 +INFO:root:iter 4, 0.527117 sec, 1.764396 GB/sec per gpu, error 0.000000 +INFO:root:iter 5, 0.520293 sec, 1.787538 GB/sec per gpu, error 0.000000 +``` + +### Multiple GPU machines + +We can use `tools/launch.py` to launch a distributed job easily. +To show the idea, we run a worker and a server on the single machine. First we put the ip +into the `hosts` file + +```bash +echo "127.0.0.1" >hosts +``` + +For more than one machine, we can replace `hosts` with the actual machine IPs +line by line.
Then launch it by + +```bash +~/mxnet/tools/bandwidth $ python ../launch.py -H hosts -n 1 python measure.py --kv-store dist_device_sync --gpus 0,1,2,3,4,5,6,7 --network resnet --depth 200 +INFO:root:Namespace(batch_size=128, data_shape='128,3,224,224', depth=200, disp_batches=1, gpus='0,1,2,3,4,5,6,7', kv_store='dist_device_sync', network='resnet', num_batches=5, num_classes=1000, optimizer='None', test_results=1) +INFO:root:num of arrays = 205, total size = 257.991328 MB +INFO:root:iter 1, 0.295398 sec, 1.528395 GB/sec per gpu, error 0.000000 +INFO:root:iter 2, 0.303159 sec, 1.489267 GB/sec per gpu, error 0.000000 +INFO:root:iter 3, 0.290734 sec, 1.552913 GB/sec per gpu, error 0.000000 +INFO:root:iter 4, 0.299437 sec, 1.507780 GB/sec per gpu, error 0.000000 +INFO:root:iter 5, 0.285363 sec, 1.582142 GB/sec per gpu, error 0.000000 +``` + +As we can see, the extra memory copy from GPUs to CPU, and then network card +harms the performance. We can slightly improve the performance using more than +1 server nodes: + +```bash +~/mxnet/tools/bandwidth $ python ../launch.py -H hosts -n 1 -s 4 python measure.py --kv-store dist_device_sync --gpus 0,1,2,3,4,5,6,7 --network resnet --depth 200 +INFO:root:Namespace(batch_size=128, data_shape='128,3,224,224', depth=200, disp_batches=1, gpus='0,1,2,3,4,5,6,7', kv_store='dist_device_sync', network='resnet', num_batches=5, num_classes=1000, optimizer='None', test_results=1) +INFO:root:num of arrays = 205, total size = 257.991328 MB +INFO:root:iter 1, 0.233309 sec, 1.935137 GB/sec per gpu, error 0.000000 +INFO:root:iter 2, 0.253864 sec, 1.778453 GB/sec per gpu, error 0.000000 +INFO:root:iter 3, 0.256627 sec, 1.759303 GB/sec per gpu, error 0.000000 +INFO:root:iter 4, 0.250969 sec, 1.798965 GB/sec per gpu, error 0.000000 +INFO:root:iter 5, 0.229306 sec, 1.968919 GB/sec per gpu, error 0.000000 +``` diff --git a/tools/bandwidth/measure.py b/tools/bandwidth/measure.py new file mode 100644 index 000000000000..d647c6829320 --- /dev/null +++ 
b/tools/bandwidth/measure.py @@ -0,0 +1,147 @@ +import os, sys +curr_path = os.path.abspath(os.path.dirname(__file__)) +sys.path.insert(0, os.path.join(curr_path, "../../python")) +sys.path.insert(0, os.path.join(curr_path, "../../example/image-classification")) +import mxnet as mx +import logging +import argparse +import time +import numpy as np + +logger = logging.getLogger() +logger.setLevel(logging.INFO) + +def parse_args(): + parser = argparse.ArgumentParser(description="command for benchmark kv-store") + parser.add_argument('--network', type=str, default="resnet", + help='the neural network to test') + parser.add_argument('--gpus', type=str, default='0,1', + help='the gpus to be used, e.g "0,1,2,3"') + parser.add_argument('--depth', type=int, default=152, + help='the depth of network, only valid for resnet') + parser.add_argument('--kv-store', type=str, default='device', + help='the kvstore type') + parser.add_argument('--batch-size', type=int, default=128, + help='batch size. should not affect the results') + parser.add_argument('--num-batches', type=int, default=5, + help='number of batches to run') + parser.add_argument('--disp-batches', type=int, default=1, + help='show averaged results for every n batches') + parser.add_argument('--test-results', type=int, default=1, + help='if or not evalute the results correctness') + parser.add_argument('--data-shape', type=str, default='128,3,224,224', + help='input data shape') + parser.add_argument('--num-classes', type=int, default=1000, + help='number of classes') + parser.add_argument('--optimizer', type=str, default='None', + help='the optimizer set to kvstore. 
None means no optimizer') + args = parser.parse_args() + logging.info(args) + return args + +def get_resnet(args): + resnet_path = os.path.join(curr_path, "./ResNet") + if not os.path.isdir(resnet_path): + os.system("git clone https://github.com/tornadomeet/ResNet") + sys.path.insert(0, resnet_path) + from symbol_resnet import resnet + if args.depth == 18: + units = [2, 2, 2, 2] + elif args.depth == 34: + units = [3, 4, 6, 3] + elif args.depth == 50: + units = [3, 4, 6, 3] + elif args.depth == 101: + units = [3, 4, 23, 3] + elif args.depth == 152: + units = [3, 8, 36, 3] + elif args.depth == 200: + units = [3, 24, 36, 3] + else: + raise ValueError("no experiments done on detph {}, you can do it youself".format(args.depth)) + + filter_list=[64, 256, 512, 1024, 2048] if args.depth >=50 else [64, 64, 128, 256, 512] + bottle_neck = True if args.depth >= 50 else False + symbol = resnet(units=units, num_stage=4, filter_list=filter_list, + num_class=args.num_classes, data_type="imagenet", bottle_neck=bottle_neck, bn_mom=.9, workspace=512) + return symbol + +def get_shapes(symbol, data_shape): + arg_name = symbol.list_arguments() + arg_shape, _, _ = symbol.infer_shape(data=data_shape) + shapes = [s for n,s in zip(arg_name, arg_shape) if 'weight' in n or 'bias' in n] + return shapes + +def diff(a, b): + return np.sum(np.abs(a.asnumpy() - b.asnumpy())) + +def error(gpu_res, cpu_res): + res = sum([sum([diff(a, b) for a in w]) for w, b in zip(gpu_res, cpu_res)]) + res /= sum([np.sum(np.abs(g.asnumpy())) for g in cpu_res]) + return res + +def run(): + args = parse_args(); + # create kvstore and optimizer + devs = [mx.gpu(int(i)) for i in args.gpus.split(',')] + kv = mx.kv.create(args.kv_store) + if args.optimizer == 'None': + optimizer = None + else: + optimizer = mx.optimizer.Optimizer.create_optimizer(args.optimizer) + updater = mx.optimizer.get_updater(mx.optimizer.Optimizer.create_optimizer(args.optimizer)) + kv.set_optimizer(optimizer) + + # create network + if 
args.network == 'resnet': + symbol = get_resnet(args) + else: + import importlib + symbol = importlib.import_module('symbol_' + args.network).get_symbol(args.num_classes) + data_shape = tuple([int(s) for s in args.data_shape.split(',')]) + shapes = get_shapes(symbol, data_shape) + + size = float(sum([reduce(lambda x,y : x*y, s, 1) for s in shapes])) * 4 / 1e6 + logging.info('num of arrays = %d, total size = %f MB' % (len(shapes), size)) + + for i, s in enumerate(shapes): + kv.init(i, mx.nd.zeros(s)) + + grads_val = [[mx.random.uniform(-1,1,shape=s) for d in devs] for s in shapes] + grads = [[g.as_in_context(d) for g, d in zip(gs, devs)] for gs in grads_val] + weights = [[mx.nd.zeros(s, d) for d in devs] for s in shapes] + + cpu_grads = [mx.nd.array(sum([g.asnumpy() for g in gs]))*kv.num_workers for gs in grads_val] + cpu_weights = [mx.nd.zeros(s) for s in shapes] + toc = 0 + for b in range(0, args.num_batches+1): + tic = time.time() + for i,g in enumerate(grads): + kv.push(i, g, i) + + for i,w in enumerate(weights): + kv.pull(i, w, i) + for ws in weights: + for w in ws: + w.wait_to_read() + toc += time.time() - tic + if args.test_results: + if optimizer == None: + err = error(weights, cpu_grads) + else: + for i, wg in enumerate(zip(cpu_weights, cpu_grads)): + updater(i, wg[1], wg[0]) + err = error(weights, cpu_weights) + else: + err = -1 + + if b % args.disp_batches == 0: + toc /= args.disp_batches + if b != 0: + # 0 is used for warmup, ignored + logging.info('iter %d, %f sec, %f GB/sec per gpu, error %f' % ( + b, toc, size*2*(len(devs)-1)/len(devs)/toc/1e3, err)) + toc = 0 + +if __name__ == "__main__": + run() diff --git a/tools/caffe_converter/README.md b/tools/caffe_converter/README.md index 3155239daf1d..92cd2aa55019 100644 --- a/tools/caffe_converter/README.md +++ b/tools/caffe_converter/README.md @@ -35,3 +35,6 @@ For example: `python convert_model.py VGG_ILSVRC_16_layers_deploy.prototxt VGG_I * We have verified the results of VGG_16/VGG_19 model and 
BVLC_googlenet results from Caffe model zoo. * The tool only supports single input and single output network. * The tool can only work with the L2LayerParameter in Caffe. +* Caffe uses a convention for multi-strided pooling output shape inconsistent with MXNet + * This importer doesn't handle this problem properly yet + * And example of this failure is importing bvlc_Googlenet. The user needs to add padding to stride-2 pooling to make this work right now. \ No newline at end of file diff --git a/tools/caffe_converter/convert_model.py b/tools/caffe_converter/convert_model.py index 891681fb347a..a139db111b64 100644 --- a/tools/caffe_converter/convert_model.py +++ b/tools/caffe_converter/convert_model.py @@ -72,9 +72,11 @@ def main(): wmat_dim = list(layer_blobs[0].shape) wmat = np.array(layer_blobs[0].data).reshape(wmat_dim) bias = np.array(layer_blobs[1].data) - if first_conv: - print 'Swapping BGR of caffe into RGB in mxnet' - wmat[:, [0, 2], :, :] = wmat[:, [2, 0], :, :] + channels = layer_blobs[0].channels; + if channels == 3 or channels == 4: # RGB or RGBA + if first_conv: + print 'Swapping BGR of caffe into RGB in mxnet' + wmat[:, [0, 2], :, :] = wmat[:, [2, 0], :, :] assert(wmat.flags['C_CONTIGUOUS'] is True) assert(bias.flags['C_CONTIGUOUS'] is True) diff --git a/tools/caffe_converter/convert_symbol.py b/tools/caffe_converter/convert_symbol.py index a62a25e7e7cf..a8855c7435c1 100644 --- a/tools/caffe_converter/convert_symbol.py +++ b/tools/caffe_converter/convert_symbol.py @@ -77,6 +77,12 @@ def proto2script(proto_file): input_dim = proto.input_dim elif len(proto.input_shape) > 0: input_dim = proto.input_shape[0].dim + elif (layer[0].type == "Input"): + input_dim = layer[0].input_param.shape._values[0].dim + layer.pop(0) + else: + raise Exception('Invalid proto file.') + # We assume the first bottom blob of first layer is the output from data layer input_name = layer[0].bottom[0] output_name = "" @@ -116,6 +122,14 @@ def proto2script(proto_file): type_string 
= 'mx.symbol.Activation' param_string = "act_type='relu'" need_flatten[name] = need_flatten[mapping[layer[i].bottom[0]]] + if layer[i].type == 'TanH' or layer[i].type == 23: + type_string = 'mx.symbol.Activation' + param_string = "act_type='tanh'" + need_flatten[name] = need_flatten[mapping[layer[i].bottom[0]]] + if layer[i].type == 'Sigmoid' or layer[i].type == 19: + type_string = 'mx.symbol.Activation' + param_string = "act_type='sigmoid'" + need_flatten[name] = need_flatten[mapping[layer[i].bottom[0]]] if layer[i].type == 'LRN' or layer[i].type == 15: type_string = 'mx.symbol.LRN' param = layer[i].lrn_param diff --git a/tools/caffe_converter/mean_image.py b/tools/caffe_converter/mean_image.py new file mode 100644 index 000000000000..32c5a25134e5 --- /dev/null +++ b/tools/caffe_converter/mean_image.py @@ -0,0 +1,48 @@ +import mxnet as mx +import numpy as np +import argparse + +caffe_flag = True +try: + import caffe + from caffe.proto import caffe_pb2 +except ImportError: + caffe_flag = False + import caffe_parse.caffe_pb2 + +def protoBlobFileToND(protofile): + data = '' + file = open(protofile, "r") + if not file: + raise self.ProcessException("ERROR (" + protofile + ")!") + data = file.read() + file.close() + + if caffe_flag: + mean_blob = caffe.proto.caffe_pb2.BlobProto() + else: + mean_blob = caffe_parse.caffe_pb2.BlobProto() + + mean_blob.ParseFromString(data) + img_mean_np = np.array(mean_blob.data) + img_mean_np = img_mean_np.reshape( + mean_blob.channels, mean_blob.height, mean_blob.width + ) + # swap channels from Caffe BGR to RGB + img_mean_np2 = img_mean_np + img_mean_np[0] = img_mean_np2[2] + img_mean_np[2] = img_mean_np2[0] + return mx.nd.array(img_mean_np) + +def main(): + parser = argparse.ArgumentParser(description='Caffe prototxt to mxnet model parameter converter.\ + Note that only basic functions are implemented. 
You are welcomed to contribute to this file.') + parser.add_argument('mean_image_proto', help='The protobuf file in Caffe format') + parser.add_argument('save_name', help='The name of the output file prefix') + args = parser.parse_args() + nd = protoBlobFileToND(args.mean_image_proto) + mx.nd.save(args.save_name + ".nd", {"mean_image": nd}) + + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/tools/im2rec.cc b/tools/im2rec.cc index 408cf9bc35a5..9718cec2ec5a 100644 --- a/tools/im2rec.cc +++ b/tools/im2rec.cc @@ -49,6 +49,7 @@ int main(int argc, char *argv[]) { "\tcolor=USE_COLOR[default=1] Force color (1), gray image (0) or keep source unchanged (-1).\n"\ "\tresize=newsize resize the shorter edge of image to the newsize, original images will be packed by default\n"\ "\tlabel_width=WIDTH[default=1] specify the label_width in the list, by default set to 1\n"\ + "\tpack_label=PACK_LABEL[default=0] whether to also pack multi dimenional label in the record file\n"\ "\tnsplit=NSPLIT[default=1] used for part generation, logically split the image.list to NSPLIT parts by position\n"\ "\tpart=PART[default=0] used for part generation, pack the images from the specific part in image.list\n"\ "\tcenter_crop=CENTER_CROP[default=0] specify whether to crop the center image to make it square.\n"\ @@ -59,6 +60,7 @@ int main(int argc, char *argv[]) { return 0; } int label_width = 1; + int pack_label = 0; int new_size = -1; int nsplit = 1; int partid = 0; @@ -70,9 +72,18 @@ int main(int argc, char *argv[]) { std::string encoding(".jpg"); for (int i = 4; i < argc; ++i) { char key[128], val[128]; - if (sscanf(argv[i], "%[^=]=%s", key, val) == 2) { + int effct_len = 0; + +#ifdef _MSC_VER + effct_len = sscanf_s(argv[i], "%[^=]=%s", key, sizeof(key), val, sizeof(val)); +#else + effct_len = sscanf(argv[i], "%[^=]=%s", key, val); +#endif + + if (effct_len == 2) { if (!strcmp(key, "resize")) new_size = atoi(val); if (!strcmp(key, "label_width")) label_width = 
atoi(val); + if (!strcmp(key, "pack_label")) pack_label = atoi(val); if (!strcmp(key, "nsplit")) nsplit = atoi(val); if (!strcmp(key, "part")) partid = atoi(val); if (!strcmp(key, "center_crop")) center_crop = atoi(val); @@ -85,10 +96,13 @@ int main(int argc, char *argv[]) { } // Check parameters ranges if (color_mode != -1 && color_mode != 0 && color_mode != 1) { - LOG(FATAL) << "Color mode must be -1, 0 or 1."; + LOG(FATAL) << "Color mode must be -1, 0 or 1."; } if (encoding != std::string(".jpg") && encoding != std::string(".png")) { - LOG(FATAL) << "Encoding mode must be .jpg or .png."; + LOG(FATAL) << "Encoding mode must be .jpg or .png."; + } + if (label_width <= 1 && pack_label) { + LOG(FATAL) << "pack_label can only be used when label_width > 1"; } if (new_size > 0) { LOG(INFO) << "New Image Size: Short Edge " << new_size; @@ -106,7 +120,7 @@ int main(int argc, char *argv[]) { } LOG(INFO) << "Encoding is " << encoding; - if (encoding == std::string(".png") and quality > 9) { + if (encoding == std::string(".png") && quality > 9) { quality = 3; } if (inter_method != 1) { @@ -168,16 +182,25 @@ int main(int argc, char *argv[]) { LOG(INFO) << "JPEG encoding quality: " << quality; } dmlc::InputSplit::Blob line; + std::vector label_buf(label_width, 0.f); while (flist->NextRecord(&line)) { std::string sline(static_cast(line.dptr), line.size); std::istringstream is(sline); if (!(is >> rec.header.image_id[0] >> rec.header.label)) continue; + label_buf[0] = rec.header.label; for (int k = 1; k < label_width; ++k) { - float tmp; - CHECK(is >> tmp) + CHECK(is >> label_buf[k]) << "Invalid ImageList, did you provide the correct label_width?"; } + if (pack_label) rec.header.flag = label_width; + rec.SaveHeader(&blob); + if (pack_label) { + size_t bsize = blob.size(); + blob.resize(bsize + label_buf.size()*sizeof(float)); + memcpy(BeginPtr(blob) + bsize, + BeginPtr(label_buf), label_buf.size()*sizeof(float)); + } CHECK(std::getline(is, fname)); // eliminate invalid chars in 
the end while (fname.length() != 0 && @@ -190,7 +213,6 @@ int main(int argc, char *argv[]) { path = root + p; // use "r" is equal to rb in dmlc::Stream dmlc::Stream *fi = dmlc::Stream::Create(path.c_str(), "r"); - rec.SaveHeader(&blob); decode_buf.clear(); size_t imsize = 0; while (true) { @@ -201,6 +223,8 @@ int main(int argc, char *argv[]) { if (nread != kBufferSize) break; } delete fi; + + if (unchanged != 1) { cv::Mat img = cv::imdecode(decode_buf, color_mode); CHECK(img.data != NULL) << "OpenCV decode fail:" << path; @@ -234,6 +258,8 @@ int main(int argc, char *argv[]) { } encode_buf.clear(); CHECK(cv::imencode(encoding, res, encode_buf, encode_params)); + + // write buffer size_t bsize = blob.size(); blob.resize(bsize + encode_buf.size()); memcpy(BeginPtr(blob) + bsize, diff --git a/tools/im2rec.py b/tools/im2rec.py index 7df1f5a6c72f..040735595121 100644 --- a/tools/im2rec.py +++ b/tools/im2rec.py @@ -1,238 +1,255 @@ -import os -import sys - -curr_path = os.path.abspath(os.path.dirname(__file__)) -sys.path.append(os.path.join(curr_path, "../python")) -import mxnet as mx -import random -import argparse -import cv2 -import time - - -def list_image(root, recursive, exts): - image_list = [] - if recursive: - cat = {} - for path, subdirs, files in os.walk(root, followlinks=True): - subdirs.sort() - print(len(cat), path) - for fname in files: - fpath = os.path.join(path, fname) - suffix = os.path.splitext(fname)[1].lower() - if os.path.isfile(fpath) and (suffix in exts): - if path not in cat: - cat[path] = len(cat) - image_list.append((len(image_list), os.path.relpath(fpath, root), cat[path])) - else: - for fname in os.listdir(root): - fpath = os.path.join(root, fname) - suffix = os.path.splitext(fname)[1].lower() - if os.path.isfile(fpath) and (suffix in exts): - image_list.append((len(image_list), os.path.relpath(fpath, root), 0)) - return image_list - - -def write_list(path_out, image_list): - with open(path_out, 'w') as fout: - n_images = 
xrange(len(image_list)) - for i in n_images: - line = '%d\t' % image_list[i][0] - for j in image_list[i][2:]: - line += '%d\t' % j - line += '%s\n' % image_list[i][1] - fout.write(line) - - -def make_list(args): - image_list = list_image(args.root, args.recursive, args.exts) - if args.shuffle is True: - random.seed(100) - random.shuffle(image_list) - N = len(image_list) - chunk_size = (N + args.chunks - 1) / args.chunks - for i in xrange(args.chunks): - chunk = image_list[i * chunk_size:(i + 1) * chunk_size] - if args.chunks > 1: - str_chunk = '_%d' % i - else: - str_chunk = '' - sep = int(chunk_size * args.train_ratio) - sep_test = int(chunk_size * args.test_ratio) - write_list(args.prefix + str_chunk + '_test.lst', chunk[:sep_test]) - write_list(args.prefix + str_chunk + '_train.lst', chunk[sep_test:sep_test + sep]) - write_list(args.prefix + str_chunk + '_val.lst', chunk[sep_test + sep:]) - - -def read_list(path_in): - image_list = [] - with open(path_in) as fin: - for line in fin.readlines(): - line = [i.strip() for i in line.strip().split('\t')] - item = [int(line[0])] + [line[-1]] + [int(i) for i in line[1:-1]] - image_list.append(item) - return image_list - - -def write_record(args, image_list, fname): - source = image_list - tic = [time.time()] - color_modes = {-1: cv2.IMREAD_UNCHANGED, - 0: cv2.IMREAD_GRAYSCALE, - 1: cv2.IMREAD_COLOR} - total = len(source) - - def image_encode(item, q_out): - try: - img = cv2.imread(os.path.join(args.root, item[1]), color_modes[args.color]) - except: - print 'imread error:', item[1] - return - if img is None: - print 'read none error:', item[1] - return - if args.center_crop: - if img.shape[0] > img.shape[1]: - margin = (img.shape[0] - img.shape[1]) / 2; - img = img[margin:margin + img.shape[1], :] - else: - margin = (img.shape[1] - img.shape[0]) / 2; - img = img[:, margin:margin + img.shape[0]] - if args.resize: - if img.shape[0] > img.shape[1]: - newsize = (args.resize, img.shape[0] * args.resize / img.shape[1]) - else: 
- newsize = (img.shape[1] * args.resize / img.shape[0], args.resize) - img = cv2.resize(img, newsize) - header = mx.recordio.IRHeader(0, item[2], item[0], 0) - - try: - s = mx.recordio.pack_img(header, img, quality=args.quality, img_fmt=args.encoding) - q_out.put(('data', s, item)) - except: - print 'pack_img error:', item[1] - return - - def read_worker(q_in, q_out): - while not q_in.empty(): - item = q_in.get() - image_encode(item, q_out) - - def write_worker(q_out, fname, saving_folder): - pre_time = time.time() - sink = [] - os.chdir(saving_folder) - fname_rec = fname[:fname.rfind('.')] - record = mx.recordio.MXRecordIO(fname_rec + '.rec', 'w') - while True: - stat, s, item = q_out.get() - if stat == 'finish': - write_list(fname_rec + '.lst', sink) - break - record.write(s) - sink.append(item) - if len(sink) % 1000 == 0: - cur_time = time.time() - print 'time:', cur_time - pre_time, ' count:', len(sink) - pre_time = cur_time - - try: - import multiprocessing - q_in = [multiprocessing.Queue() for i in range(args.num_thread)] - q_out = multiprocessing.Queue(1024) - for i in range(len(image_list)): - q_in[i % len(q_in)].put(image_list[i]) - read_process = [multiprocessing.Process(target=read_worker, args=(q_in[i], q_out)) \ - for i in range(args.num_thread)] - for p in read_process: - p.start() - write_process = multiprocessing.Process(target=write_worker, args=(q_out, fname, args.saving_folder)) - write_process.start() - for p in read_process: - p.join() - q_out.put(('finish', '', [])) - write_process.join() - except ImportError: - print('multiprocessing not available, fall back to single threaded encoding') - import Queue - q_out = Queue.Queue() - os.chdir(args.saving_folder) - fname_rec = fname[:fname.rfind('.')] - record = mx.recordio.MXRecordIO(fname_rec + '.rec', 'w') - cnt = 0 - pre_time = time.time() - for item in image_list: - image_encode(item, q_out) - if q_out.empty(): - continue - _, s, _ = q_out.get() - record.write(s) - cnt += 1 - if cnt % 1000 == 
0: - cur_time = time.time() - print 'time:', cur_time - pre_time, ' count:', cnt - pre_time = cur_time - - -def main(): - parser = argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - description='Create an image list or \ - make a record database by reading from an image list') - parser.add_argument('prefix', help='prefix of input/output files.') - parser.add_argument('root', help='path to folder containing images.') - - cgroup = parser.add_argument_group('Options for creating image lists') - cgroup.add_argument('--list', type=bool, default=False, - help='If this is set im2rec will create image list(s) by traversing root folder\ - and output to .lst.\ - Otherwise im2rec will read .lst and create a database at .rec') - cgroup.add_argument('--exts', type=list, default=['.jpeg', '.jpg'], - help='list of acceptable image extensions.') - cgroup.add_argument('--chunks', type=int, default=1, help='number of chunks.') - cgroup.add_argument('--train-ratio', type=float, default=1.0, - help='Ratio of images to use for training.') - cgroup.add_argument('--test-ratio', type=float, default=0, - help='Ratio of images to use for testing.') - cgroup.add_argument('--recursive', type=bool, default=False, - help='If true recursively walk through subdirs and assign an unique label\ - to images in each folder. 
Otherwise only include images in the root folder\ - and give them label 0.') - - rgroup = parser.add_argument_group('Options for creating database') - rgroup.add_argument('--resize', type=int, default=0, - help='resize the shorter edge of image to the newsize, original images will\ - be packed by default.') - rgroup.add_argument('--center-crop', type=bool, default=False, - help='specify whether to crop the center image to make it rectangular.') - rgroup.add_argument('--quality', type=int, default=80, - help='JPEG quality for encoding, 1-100; or PNG compression for encoding, 1-9') - rgroup.add_argument('--num_thread', type=int, default=1, - help='number of thread to use for encoding. order of images will be different\ - from the input list if >1. the input list will be modified to match the\ - resulting order.') - rgroup.add_argument('--color', type=int, default=1, choices=[-1, 0, 1], - help='specify the color mode of the loaded image.\ - 1: Loads a color image. Any transparency of image will be neglected. 
It is the default flag.\ - 0: Loads image in grayscale mode.\ - -1:Loads image as such including alpha channel.') - rgroup.add_argument('--encoding', type=str, default='.jpg', choices=['.jpg', '.png'], - help='specify the encoding of the images.') - rgroup.add_argument('--saving-folder', type=str, default='.', - help='folder in which .rec files will be saved.') - rgroup.add_argument('--shuffle', default=True, help='If this is set as True, \ - im2rec will randomize the image order in .lst') - args = parser.parse_args() - if args.list: - make_list(args) - else: - files = [f for f in os.listdir('.') if os.path.isfile(f)] - for f in files: - if f.startswith(args.prefix) is True and f.endswith('.lst') is True: - print 'Creating .rec file from', f, 'in', args.saving_folder - image_list = read_list(f) - write_record(args, image_list, f) - - -if __name__ == '__main__': - main() +# -*- coding: utf-8 -*- +from __future__ import print_function +import os +import sys + +curr_path = os.path.abspath(os.path.dirname(__file__)) +sys.path.append(os.path.join(curr_path, "../python")) +import mxnet as mx +import random +import argparse +import cv2 +import time + + +def list_image(root, recursive, exts): + image_list = [] + if recursive: + cat = {} + for path, subdirs, files in os.walk(root, followlinks=True): + subdirs.sort() + print(len(cat), path) + for fname in files: + fpath = os.path.join(path, fname) + suffix = os.path.splitext(fname)[1].lower() + if os.path.isfile(fpath) and (suffix in exts): + if path not in cat: + cat[path] = len(cat) + yield (len(image_list), os.path.relpath(fpath, root), cat[path]) + else: + for fname in os.listdir(root): + fpath = os.path.join(root, fname) + suffix = os.path.splitext(fname)[1].lower() + if os.path.isfile(fpath) and (suffix in exts): + yield (len(image_list), os.path.relpath(fpath, root), 0) + +def write_list(path_out, image_list): + with open(path_out, 'w') as fout: + for i, item in enumerate(image_list): + line = '%d\t' % item[0] + for j 
in item[2:]: + line += '%f\t' % j + line += '%s\n' % item[1] + fout.write(line) + +def make_list(args): + image_list = list_image(args.root, args.recursive, args.exts) + image_list = list(image_list) + if args.shuffle is True: + random.seed(100) + random.shuffle(image_list) + N = len(image_list) + chunk_size = (N + args.chunks - 1) / args.chunks + for i in xrange(args.chunks): + chunk = image_list[i * chunk_size:(i + 1) * chunk_size] + if args.chunks > 1: + str_chunk = '_%d' % i + else: + str_chunk = '' + sep = int(chunk_size * args.train_ratio) + sep_test = int(chunk_size * args.test_ratio) + write_list(args.prefix + str_chunk + '_test.lst', chunk[:sep_test]) + write_list(args.prefix + str_chunk + '_train.lst', chunk[sep_test:sep_test + sep]) + write_list(args.prefix + str_chunk + '_val.lst', chunk[sep_test + sep:]) + +def read_list(path_in): + with open(path_in) as fin: + while True: + line = fin.readline() + if not line: + break + line = [i.strip() for i in line.strip().split('\t')] + item = [int(line[0])] + [line[-1]] + [float(i) for i in line[1:-1]] + yield item + +def image_encode(args, item, q_out): + try: + img = cv2.imread(os.path.join(args.root, item[1]), args.color) + except: + print('imread error:', item[1]) + return + if img is None: + print('read none error:', item[1]) + return + if args.center_crop: + if img.shape[0] > img.shape[1]: + margin = (img.shape[0] - img.shape[1]) / 2; + img = img[margin:margin + img.shape[1], :] + else: + margin = (img.shape[1] - img.shape[0]) / 2; + img = img[:, margin:margin + img.shape[0]] + if args.resize: + if img.shape[0] > img.shape[1]: + newsize = (args.resize, img.shape[0] * args.resize / img.shape[1]) + else: + newsize = (img.shape[1] * args.resize / img.shape[0], args.resize) + img = cv2.resize(img, newsize) + if len(item) > 3 and args.pack_label: + header = mx.recordio.IRHeader(0, item[2:], item[0], 0) + else: + header = mx.recordio.IRHeader(0, item[2], item[0], 0) + + try: + s = mx.recordio.pack_img(header, 
img, quality=args.quality, img_fmt=args.encoding) + q_out.put((s, item)) + except Exception, e: + print('pack_img error:', item[1], e) + return + +def read_worker(args, q_in, q_out): + while True: + item = q_in.get() + if item is None: + break + image_encode(args, item, q_out) + +def write_worker(q_out, fname, working_dir): + pre_time = time.time() + count = 0 + fname_rec = os.path.basename(fname) + fname_rec = os.path.splitext(fname)[0] + '.rec' + fout = open(fname+'.tmp', 'w') + record = mx.recordio.MXRecordIO(os.path.join(working_dir, fname_rec), 'w') + while True: + deq = q_out.get() + if deq is None: + break + s, item = deq + record.write(s) + + line = '%d\t' % item[0] + for j in item[2:]: + line += '%f\t' % j + line += '%s\n' % item[1] + fout.write(line) + + if count % 1000 == 0: + cur_time = time.time() + print('time:', cur_time - pre_time, ' count:', count) + pre_time = cur_time + count += 1 + os.rename(fname+'.tmp', fname) + +def parse_args(): + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + description='Create an image list or \ + make a record database by reading from an image list') + parser.add_argument('prefix', help='prefix of input/output lst and rec files.') + parser.add_argument('root', help='path to folder containing images.') + + cgroup = parser.add_argument_group('Options for creating image lists') + cgroup.add_argument('--list', type=bool, default=False, + help='If this is set im2rec will create image list(s) by traversing root folder\ + and output to .lst.\ + Otherwise im2rec will read .lst and create a database at .rec') + cgroup.add_argument('--exts', type=list, default=['.jpeg', '.jpg'], + help='list of acceptable image extensions.') + cgroup.add_argument('--chunks', type=int, default=1, help='number of chunks.') + cgroup.add_argument('--train-ratio', type=float, default=1.0, + help='Ratio of images to use for training.') + cgroup.add_argument('--test-ratio', type=float, default=0, + 
help='Ratio of images to use for testing.') + cgroup.add_argument('--recursive', type=bool, default=False, + help='If true recursively walk through subdirs and assign an unique label\ + to images in each folder. Otherwise only include images in the root folder\ + and give them label 0.') + + rgroup = parser.add_argument_group('Options for creating database') + rgroup.add_argument('--resize', type=int, default=0, + help='resize the shorter edge of image to the newsize, original images will\ + be packed by default.') + rgroup.add_argument('--center-crop', type=bool, default=False, + help='specify whether to crop the center image to make it rectangular.') + rgroup.add_argument('--quality', type=int, default=80, + help='JPEG quality for encoding, 1-100; or PNG compression for encoding, 1-9') + rgroup.add_argument('--num-thread', type=int, default=1, + help='number of thread to use for encoding. order of images will be different\ + from the input list if >1. the input list will be modified to match the\ + resulting order.') + rgroup.add_argument('--color', type=int, default=1, choices=[-1, 0, 1], + help='specify the color mode of the loaded image.\ + 1: Loads a color image. Any transparency of image will be neglected. 
It is the default flag.\ + 0: Loads image in grayscale mode.\ + -1:Loads image as such including alpha channel.') + rgroup.add_argument('--encoding', type=str, default='.jpg', choices=['.jpg', '.png'], + help='specify the encoding of the images.') + rgroup.add_argument('--shuffle', default=True, help='If this is set as True, \ + im2rec will randomize the image order in .lst') + rgroup.add_argument('--pack-label', default=False, + help='Whether to also pack multi dimensional label in the record file') + args = parser.parse_args() + args.prefix = os.path.abspath(args.prefix) + args.root = os.path.abspath(args.root) + return args + +if __name__ == '__main__': + args = parse_args() + if args.list: + make_list(args) + else: + if os.path.isdir(args.prefix): + working_dir = args.prefix + else: + working_dir = os.path.dirname(args.prefix) + files = [os.path.join(working_dir, fname) for fname in os.listdir(working_dir) + if os.path.isfile(os.path.join(working_dir, fname))] + count = 0 + for fname in files: + if fname.startswith(args.prefix) and fname.endswith('.lst'): + print('Creating .rec file from', fname, 'in', working_dir) + count += 1 + image_list = read_list(fname) + # -- write_record -- # + try: + import multiprocessing + q_in = [multiprocessing.Queue(1024) for i in range(args.num_thread)] + q_out = multiprocessing.Queue(1024) + read_process = [multiprocessing.Process(target=read_worker, args=(args, q_in[i], q_out)) \ + for i in range(args.num_thread)] + for p in read_process: + p.start() + write_process = multiprocessing.Process(target=write_worker, args=(q_out, fname, working_dir)) + write_process.start() + + for i, item in enumerate(image_list): + q_in[i % len(q_in)].put(item) + for q in q_in: + q.put(None) + for p in read_process: + p.join() + + q_out.put(None) + write_process.join() + except ImportError: + print('multiprocessing not available, fall back to single threaded encoding') + import Queue + q_out = Queue.Queue() + fname_rec = os.path.basename(fname) + 
fname_rec = os.path.splitext(fname)[0] + '.rec' + record = mx.recordio.MXRecordIO(os.path.join(working_dir, fname_rec), 'w') + cnt = 0 + pre_time = time.time() + for item in image_list: + image_encode(args, item, q_out) + if q_out.empty(): + continue + _, s, _ = q_out.get() + record.write(s) + if cnt % 1000 == 0: + cur_time = time.time() + print('time:', cur_time - pre_time, ' count:', cnt) + pre_time = cur_time + cnt += 1 + if not count: + print('Did not find and list file with prefix %s'%args.prefix) diff --git a/tools/launch.py b/tools/launch.py index 3e0444952046..127756880495 100755 --- a/tools/launch.py +++ b/tools/launch.py @@ -19,7 +19,12 @@ def dmlc_opts(opts): '--host-file', opts.hostfile, '--sync-dst-dir', opts.sync_dst_dir] args += opts.command; - from dmlc_tracker import opts + try: + from dmlc_tracker import opts + except ImportError: + print("Can't load dmlc_tracker package. Perhaps you need to run") + print(" git submodule update --init --recursive") + raise dmlc_opts = opts.get_opts(args) return dmlc_opts @@ -49,24 +54,28 @@ def main(): args.num_servers = args.num_workers args = dmlc_opts(args) - - if args.cluster == 'local' or args.host_file is None or args.host_file == 'None': - from dmlc_tracker import local - local.submit(args) - elif args.cluster == 'sge': - from dmlc_tracker import sge - sge.submit(args) - elif args.cluster == 'yarn': - from dmlc_tracker import yarn - yarn.submit(args) - elif args.cluster == 'ssh': - from dmlc_tracker import ssh - ssh.submit(args) - elif args.cluster == 'mpi': - from dmlc_tracker import mpi - mpi.submit(args) + + if args.host_file is None or args.host_file == 'None': + if args.cluster == 'yarn': + from dmlc_tracker import yarn + yarn.submit(args) + elif args.cluster == 'local': + from dmlc_tracker import local + local.submit(args) + elif args.cluster == 'sge': + from dmlc_tracker import sge + sge.submit(args) + else: + raise RuntimeError('Unknown submission cluster type %s' % args.cluster) else: - raise 
RuntimeError('Unknown submission cluster type %s' % args.cluster) + if args.cluster == 'ssh': + from dmlc_tracker import ssh + ssh.submit(args) + elif args.cluster == 'mpi': + from dmlc_tracker import mpi + mpi.submit(args) + else: + raise RuntimeError('Unknown submission cluster type %s' % args.cluster) def signal_handler(signal, frame): logging.info('Stop luancher')